Prometheus Alertmanager and Ntfy

By hernil

This is a very minimal setup of the services listed below, managed with docker-compose. It does not contain any actual monitoring or alerting beyond a very basic scrape of Traefik metrics; that will come later. Ntfy is hosted outside this setup and is simply assumed to be available at an endpoint.

Some of the labels assume a Traefik setup for exposing containers. Adapt to your own needs.

Setup

Prometheus -> Alertmanager -> ntfy-alertmanager -> Ntfy

File structure

monitoring
├── alertmanager
│   └── alertmanager.yml
├── ntfy_alertmanager_config
│   └── config
├── prometheus
│   └── prometheus.yml
└── docker-compose.yml

The files

alertmanager/alertmanager.yml

# Alertmanager configuration: every alert is routed to a single webhook
# receiver, the ntfy-alertmanager bridge.
receivers:
  - name: ntfy
    webhook_configs:
      # Host name is the container_name of the ntfy-alertmanager service in
      # docker-compose.yml.
      - url: 'http://ntfy_alertmanager_proxy:8080'
        http_config:
          # Must match `user` / `password` in the ntfy-alertmanager config file.
          basic_auth:
            username: webhookUser
            password: webhookPass

route:
  receiver: ntfy
  group_by:
    - alertname
  repeat_interval: 4h

ntfy_alertmanager_config/config

# ntfy-alertmanager configuration: receives Alertmanager webhooks and forwards
# them to an ntfy topic.

# Port 8080 matches the webhook URL in alertmanager/alertmanager.yml and the
# `expose` entry for this container in docker-compose.yml.
http-address :8080
# Same host as the Traefik router rule for this container in docker-compose.yml.
base-url https://na-proxy.example.com
# NOTE(review): debug logging is verbose; presumably worth lowering once the
# setup is confirmed working.
log-level debug
alert-mode single
# Same credentials as the basic_auth block of the webhook receiver in
# alertmanager/alertmanager.yml.
user webhookUser
password webhookPass

# Per-label settings, keyed on the value of the `severity` alert label.
# Presumably `priority` maps to ntfy's message priority — TODO confirm range.
labels {
    order "severity"

    severity "critical" {
        priority 4
    }

    severity "warning" {
        priority 3
    }

    severity "info" {
        priority 1
    }
}

# Extra tag applied to notifications for resolved alerts.
resolved {
    tags "resolved"
}

# Ntfy is hosted outside this setup; this is the full topic URL.
ntfy {
    topic https://ntfy.example.com/alertmanager
}

# Alertmanager endpoint; the host matches the Traefik router rule for the
# alertmanager container in docker-compose.yml.
# NOTE(review): the compose file shows no basic-auth in front of Alertmanager —
# confirm whether these credentials are actually required.
alertmanager {
    silence-duration 24h
    url https://alertmanager.example.com
    user webhookUser
    password webhookPass
}

# Cache backed by the `redis` container from docker-compose.yml.
# The trailing /10 is presumably the Redis database index — TODO confirm.
cache {
    type redis
    duration 24h

    redis-url redis://redis:6379/10
}

prometheus/prometheus.yml

# Prometheus configuration: scrapes itself and Traefik, and sends alerts to
# Alertmanager.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    monitor: my-project

# Rule files are selected by glob pattern.
rule_files:
  - "alert.*.rules"

alerting:
  alertmanagers:
    - scheme: http
      static_configs:
        - targets:
            - "alertmanager:9093"

scrape_configs:
  # Prometheus scraping its own metrics, more often than the global interval.
  - job_name: prometheus
    scrape_interval: 5s
    static_configs:
      - targets:
          - "localhost:9090"

  - job_name: traefik
    metrics_path: /metrics
    static_configs:
      - targets:
          - "traefik:8080"

docker-compose.yml

# Monitoring stack: Prometheus -> Alertmanager -> ntfy-alertmanager (+ Redis
# cache). Traefik and ntfy itself live outside this compose project.
version: "3.8"

networks:
  web: # Traefik exposed network
    external: true
  stats: # Network for other resources or application stacks to share stats
    external: true
  internal: # Internal to these services
    external: false

volumes:
  alertmanager_data:
  prometheus_data:

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: "prometheus"
    user: root
    volumes:
      - ./prometheus/:/etc/prometheus/
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.size=256MB'
    labels:
      # This container is attached to several networks; tell Traefik which one
      # to route through so it does not pick one it cannot reach.
      - "traefik.docker.network=web"
      - "traefik.http.routers.prometheus.rule=Host(`prometheus.example.com`)"
      - "traefik.http.routers.prometheus.tls=true"
      - "traefik.http.routers.prometheus.tls.certresolver=lets-encrypt"
    networks:
      - web
      - stats
    restart: unless-stopped

  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    # Declared once only: duplicate mapping keys are invalid YAML and are
    # rejected by strict loaders.
    restart: unless-stopped
    expose:
      - "9093"
    volumes:
      - ./alertmanager:/config
      - alertmanager_data:/data
    command: --config.file=/config/alertmanager.yml --log.level=debug
    networks:
      - web
    labels:
      - "traefik.http.routers.alertmanager.rule=Host(`alertmanager.example.com`)"
      - "traefik.http.routers.alertmanager.tls=true"
      - "traefik.http.routers.alertmanager.tls.certresolver=lets-encrypt"

  ntfy-alertmanager:
    image: xenrox/ntfy-alertmanager:latest
    # Alertmanager's webhook URL addresses this container by this name.
    container_name: ntfy_alertmanager_proxy
    volumes:
      - ./ntfy_alertmanager_config:/etc/ntfy-alertmanager
    expose:
      - "8080"
    restart: unless-stopped
    networks:
      - web
      - internal
    labels:
      # `internal` is private to this project, so Traefik must use `web`.
      - "traefik.docker.network=web"
      - "traefik.http.routers.naproxy.rule=Host(`na-proxy.example.com`)"
      - "traefik.http.routers.naproxy.tls=true"
      - "traefik.http.routers.naproxy.tls.certresolver=lets-encrypt"

  # ntfy-alertmanager cache
  redis:
    image: redis:alpine
    container_name: redis
    restart: unless-stopped
    volumes:
      - ./cache:/data
    networks:
      - internal

Future work

  • Ingest ZFS and/or Sanoid metrics to Prometheus
  • Examine if Kopia metrics can be integrated
  • Set up appropriate alerts for the above and Traefik

Sources

https://medium.com/javarevisited/monitoring-setup-with-docker-compose-part-3-alertmanager-5d0a2d4a5612

https://git.xenrox.net/~xenrox/ntfy-alertmanager


Input or feedback to this content? Reply via email!