Homelab: Monitoring alerting through AlertManager
Previously, we set up alerting through Grafana alerting but, if you noticed, I did not expose my alerts beyond this:
services.grafana.provision.alerting = {
  rules = import ./monitoring/alert-rules.nix {
    inherit pkgs config prometheusGrafanaDataSourceUid lokiGrafanaDataSourceUid;
  };
};
I did that on purpose: Grafana alerting seems to be designed mostly for GUI/WebUI usage. For example, a simple RAM usage check takes 75 lines in a reasonably well-formatted layout, and the core of the alert looks like this:
{
  refId = "RAM load";
  relativeTimeRange = { from = 3600; to = 0; };
  datasourceUid = prometheusGrafanaDataSourceUid;
  model = {
    datasource = { type = "prometheus"; uid = prometheusGrafanaDataSourceUid; };
    editorMode = "code";
    expr = "clamp_min((1-(node_memory_MemAvailable_bytes/(node_memory_MemTotal_bytes- 4*10^9))), 0)";
    hide = false;
    instant = false;
    interval = "";
    intervalMs = 15000;
    legendFormat = "RAM load";
    maxDataPoints = 43200;
    range = true;
    refId = "RAM load";
  };
}
{
  refId = "Reduce";
  relativeTimeRange = { from = 600; to = 0; };
  datasourceUid = "__expr__";
  model = {
    conditions = [{
      evaluator = { params = [ ]; type = "gt"; };
      operator = { type = "and"; };
      query = { params = [ "A" ]; };
      reducer = { params = [ ]; type = "last"; };
      type = "query";
    }];
    datasource = { type = "__expr__"; uid = "__expr__"; };
    expression = "RAM load";
    hide = false;
    intervalMs = 1000;
    maxDataPoints = 43200;
    reducer = "last";
    refId = "Reduce";
    type = "reduce";
  };
}
{
  refId = "Threshold";
  relativeTimeRange = { from = 600; to = 0; };
  datasourceUid = "__expr__";
  model = {
    conditions = [{
      evaluator = { params = [ 0.5 ]; type = "gt"; };
      operator = { type = "and"; };
      query = { params = [ "B" ]; };
      reducer = { params = [ ]; type = "last"; };
      type = "query";
    }];
    datasource = { type = "__expr__"; uid = "__expr__"; };
    expression = "Reduce";
    hide = false;
    intervalMs = 1000;
    maxDataPoints = 43200;
    refId = "Threshold";
    type = "threshold";
  };
}
Not really readable.
On the other hand, I came across Awesome Prometheus Alerts, which is a collection of alerting rules for Prometheus and Alertmanager (Alertmanager is deployed alongside Prometheus).
To highlight the difference, here is the same rule expressed as a Prometheus alerting rule:
{
  alert = "HostOutOfMemory";
  for = "5m";
  expr = ''(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'';
  labels.severity = "warning";
  annotations.summary = "Warning: RAM usage is > 20%";
}
Seven lines. Moreover, the query (in expr) is a bit more complex because it handles multi-node deployments (the join on node_uname_info attaches the node's hostname to the alert), even though that is not useful here.
My whole set of alerting rules looks like this:
services.prometheus.rules = map builtins.toJSON [{
  groups = [
    {
      name = "barracuda";
      rules = [
        {
          alert = "HostOutOfMemory";
          for = "5m";
          expr = ''(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'';
          labels.severity = "warning";
          annotations.summary = "Warning: RAM usage is > 20%";
        }
        {
          alert = "HostOutOfDiskSpaceRoot";
          for = "5m";
          expr = ''((node_filesystem_avail_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < 50 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'';
          labels.severity = "warning";
          annotations.summary = ''Warning: Root filesystem ("/") is going full (> 50%)'';
        }
        {
          alert = "HostOutOfDiskSpaceBoot";
          for = "5m";
          expr = ''((node_filesystem_avail_bytes{mountpoint="/boot"} * 100) / node_filesystem_size_bytes{mountpoint="/boot"} < 50 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'';
          labels.severity = "warning";
          annotations.summary = ''Warning: Boot filesystem ("/boot") is going full (> 50%)'';
        }
        {
          alert = "HostHighCpuLoad";
          for = "1m";
          expr = ''(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.2) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'';
          labels.severity = "warning";
          annotations.summary = "Warning: CPU is unusually overloaded (> 20%)";
        }
        {
          alert = "HostSystemdServiceCrashed";
          for = "0m";
          expr = ''(node_systemd_unit_state{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'';
          labels.severity = "warning";
          annotations.summary = "Warning: systemd service crashed ({{ $value }})";
        }
        {
          alert = "HostNodeOvertemperatureAlarm";
          for = "5m";
          expr = ''((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 50)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'';
          labels.severity = "warning";
          annotations.summary = "Warning: temperature is too high";
        }
      ];
    }
    {
      name = "monitoring";
      rules = [
        {
          alert = "PromtailRequestErrors";
          expr = ''100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10'';
          for = "5m";
          labels.severity = "critical";
          annotations.summary = "Promtail request errors";
        }
        {
          alert = "PromtailRequestLatency";
          expr = ''histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (le)) > 1'';
          for = "5m";
          labels.severity = "critical";
          annotations.summary = "Promtail request latency";
        }
        {
          alert = "LokiProcessTooManyRestarts";
          expr = ''changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2'';
          for = "0m";
          labels.severity = "warning";
          annotations.summary = "Loki process too many restarts";
        }
        {
          alert = "LokiRequestErrors";
          expr = ''100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10'';
          for = "15m";
          labels.severity = "critical";
          annotations.summary = "Loki request errors";
        }
        {
          alert = "LokiRequestPanic";
          expr = ''sum(increase(loki_panic_total[10m])) by (namespace, job) > 0'';
          for = "5m";
          labels.severity = "critical";
          annotations.summary = "Loki request panic";
        }
        {
          alert = "LokiRequestLatency";
          expr = ''(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1'';
          for = "5m";
          labels.severity = "critical";
          annotations.summary = "Loki request latency";
        }
      ];
    }
    {
      name = "services";
      interval = "1h";
      rules = [
        {
          alert = "ResticCheckFailed";
          for = "0m";
          expr = ''restic_check_success == 0'';
          labels.severity = "warning";
          annotations.summary = ''Warning: Restic check failed for "{{ $labels.snapshot_paths }}"'';
        }
        {
          alert = "ResticOutdatedBackupLoopingNixosConfig";
          for = "0m";
          expr = ''time() - restic_backup_timestamp{snapshot_paths="/etc/nixos"} > 435600'';
          labels.severity = "critical";
          annotations.summary = ''Critical: Restic last backup of "/etc/nixos" is older than 5 days (and one hour)'';
        }
        {
          alert = "ResticOutdatedBackupLoopingHome";
          for = "0m";
          expr = ''time() - restic_backup_timestamp{snapshot_paths="/home/black"} > 1299600'';
          labels.severity = "critical";
          annotations.summary = ''Critical: Restic last backup of "/home/black" is older than 15 days (and one hour)'';
        }
      ];
    }
    {
      name = "hannibal";
      rules = [
        {
          alert = "HostOutOfDiskSpaceHannibal";
          for = "5m";
          expr = ''((node_filesystem_avail_bytes{mountpoint="/mnt/hannibal_medias"} * 100) / node_filesystem_size_bytes{mountpoint="/mnt/hannibal_medias"} < 20 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'';
          labels.severity = "warning";
          annotations.summary = ''Warning: Hannibal is going full (> 80%)'';
        }
      ];
    }
  ];
}];
That's 16 rules in 137 lines (roughly the size of two Grafana alerts).
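Note that Prometheus also has to be told where to deliver firing alerts; I do not show that part here. A minimal sketch, assuming Alertmanager listens locally on the port configured below (3041), could look like this:
# Sketch: point Prometheus at the local Alertmanager instance.
services.prometheus.alertmanagers = [
  {
    scheme = "http";
    static_configs = [
      # Reuse the port set on services.prometheus.alertmanager below.
      { targets = [ "127.0.0.1:${toString config.services.prometheus.alertmanager.port}" ]; }
    ];
  }
];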
Once our rules are set, we have to route the resulting alerts to IRC:
services.prometheus.alertmanager = {
  enable = true;
  port = 3041;
  configuration = {
    global = { };
    route = {
      receiver = "local-irc";
      group_wait = "30s";
      group_interval = "5m";
      repeat_interval = "4h";
      group_by = [ "alertname" ];
      routes = [
        { receiver = "local-irc"; }
      ];
    };
    receivers = [
      {
        name = "local-irc";
        webhook_configs = [
          {
            url = "http://127.0.0.1:${toString config.services.prometheus.alertmanagerIrcRelay.settings.http_port}/barracuda-monitoring";
            send_resolved = true;
          }
        ];
      }
    ];
  };
};
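The webhook above points at alertmanager-irc-relay, whose configuration I did not detail. As a rough sketch (the IRC server, nickname, channel, and port below are placeholders, not my actual values), its NixOS module can be wired up like this:
# Rough sketch of alertmanager-irc-relay; server, nickname and channel are placeholders.
services.prometheus.alertmanagerIrcRelay = {
  enable = true;
  settings = {
    http_host = "127.0.0.1";
    http_port = 3042;             # the port interpolated into the webhook URL above
    irc_host = "irc.example.org"; # placeholder IRC server
    irc_port = 6697;
    irc_nickname = "alertbot";    # placeholder nickname
    # The webhook path /barracuda-monitoring selects this channel.
    irc_channels = [ { name = "#barracuda-monitoring"; } ];
  };
};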
Here we are: a fully set-up and monitored infrastructure.