Homelab: Monitoring introduction

As you might have guessed from my previous log, I'm obsessed with outages.

There are two ways to detect an outage:

  • Have patient users
  • Have a monitoring system

Let's be honest, I'm not patient.

In France, we have a saying:

Charité bien ordonnée commence par soi-même

(Charity begins at home)

Let's start with the LGTM stack.

Fortunately, NixOS supports it well.

We can start with Prometheus (which exposes endpoints with metrics):

# Prometheus server and its node exporter, both running on this host.
services.prometheus = {
  enable = true;
  port = 3020;

  # Node exporter with the "systemd" collector switched on.
  exporters.node = {
    enable = true;
    port = 3021;
    enabledCollectors = [ "systemd" ];
  };

  # One scrape job targeting the local node exporter. The port is read back
  # from the exporter option above so the two values cannot drift apart.
  scrapeConfigs = [
    {
      job_name = "nodes";
      static_configs = [
        {
          targets = [
            "127.0.0.1:${toString config.services.prometheus.exporters.node.port}"
          ];
        }
      ];
    }
  ];
};

Then Loki, which receives logs and exposes an endpoint with logs:

# Loki with authentication disabled, in-memory rings, and index/chunks kept
# on the local filesystem under /var/lib/loki.
services.loki = {
  enable = true;

  configuration = {
    auth_enabled = false;
    server.http_listen_port = 3030;

    # Ingester: in-memory ring with a replication factor of 1.
    ingester = {
      lifecycler = {
        address = "127.0.0.1";
        ring = {
          replication_factor = 1;
          kvstore.store = "inmemory";
        };
      };
      chunk_idle_period = "1h";
      max_chunk_age = "1h";
      chunk_target_size = 999999;
      chunk_retain_period = "30s";
      max_transfer_retries = 0;
    };

    # Index schema: boltdb-shipper index, filesystem object store, one index
    # table per 24h.
    schema_config.configs = [
      {
        from = "2022-06-06";
        store = "boltdb-shipper";
        object_store = "filesystem";
        schema = "v11";
        index.prefix = "index_";
        index.period = "24h";
      }
    ];

    # On-disk locations for the index, its cache, and the chunks.
    storage_config = {
      boltdb_shipper = {
        active_index_directory = "/var/lib/loki/boltdb-shipper-active";
        cache_location = "/var/lib/loki/boltdb-shipper-cache";
        cache_ttl = "24h";
        shared_store = "filesystem";
      };
      filesystem.directory = "/var/lib/loki/chunks";
    };

    # Samples older than 168h are rejected on ingestion.
    limits_config = {
      reject_old_samples = true;
      reject_old_samples_max_age = "168h";
    };

    chunk_store_config.max_look_back_period = "0s";

    # Retention deletes are switched off.
    table_manager = {
      retention_deletes_enabled = false;
      retention_period = "0s";
    };

    compactor = {
      working_directory = "/var/lib/loki";
      shared_store = "filesystem";
      compactor_ring.kvstore.store = "inmemory";
    };
  };
};

And Promtail to push logs to Loki:

# Promtail: reads the systemd journal and pushes the entries to the local Loki.
services.promtail = {
  enable = true;
  configuration = {
    server = {
      http_listen_port = 3031;
      grpc_listen_port = 0;
    };
    positions = {
      # Keep the read positions outside /tmp: /tmp is not persistent, and
      # losing this file means the journal (up to max_age below) can be read
      # and pushed again.
      # NOTE(review): the NixOS promtail unit is expected to provide
      # /var/cache/promtail (CacheDirectory) -- confirm against your nixpkgs
      # revision.
      filename = "/var/cache/promtail/positions.yaml";
    };
    # Push endpoint of the local Loki; the port is read from the Loki
    # configuration so both services stay in sync.
    clients = [{
      url = "http://127.0.0.1:${toString config.services.loki.configuration.server.http_listen_port}/loki/api/v1/push";
    }];
    scrape_configs = [{
      job_name = "journal";
      journal = {
        max_age = "31d";
        # Static labels attached to every journal entry.
        labels = {
          job = "systemd-journal";
          host = "barracuda";
        };
      };
      # Copy the journal's systemd unit name into a "unit" label.
      relabel_configs = [{
        source_labels = [ "__journal__systemd_unit" ];
        target_label = "unit";
      }];
    }];
  };
};

Finally, Grafana to have a UI:

# Grafana bound to localhost, with the local Prometheus and Loki provisioned
# as datasources.
services.grafana = {
  enable = true;

  settings = {
    # Grafana's key is `reporting_enabled`; the misspelled `reporting_enable`
    # is an unknown key and leaves usage reporting switched on.
    analytics.reporting_enabled = false;
    server = {
      # Plain HTTP on localhost while root_url is HTTPS.
      # NOTE(review): presumably a TLS-terminating reverse proxy sits in
      # front -- confirm.
      root_url = "https://monitoring.barracuda.local";
      http_port = 3010;
      http_addr = "127.0.0.1";
      protocol = "http";
    };
    security = {
      admin_user = "admin";
      # NOTE(review): default credentials, stored in plain text in the
      # world-readable Nix store. Consider reading the password from a file
      # (e.g. Grafana's `$__file{...}` syntax) before exposing this instance.
      admin_password = "admin";
    };
  };

  provision = {
    enable = true;
    # Fixed uids let provisioned dashboards and alert rules reference these
    # datasources by a stable identifier.
    datasources.settings.datasources = [
      {
        uid = "cd47d5c1-f607-418e-a51c-080cbf31fefb";
        name = "Local Prometheus";
        type = "prometheus";
        access = "proxy";
        url = "http://127.0.0.1:${toString config.services.prometheus.port}";
      }
      {
        uid = "3d92e8be-db1e-4808-ac93-c96c61f01fe0";
        name = "Local Loki";
        type = "loki";
        access = "proxy";
        url = "http://127.0.0.1:${toString config.services.loki.configuration.server.http_listen_port}";
      }
    ];
  };
};

Note that I have set up random UUIDs (well, Vim did) to make it easier to provision dashboards and alerting (in follow-ups).