summary | refs | log | tree | commit | diff
path: root/internal
diff options
context:
space:
mode:
author Paul Buetow <paul@buetow.org> 2026-01-08 21:41:29 +0200
committer Paul Buetow <paul@buetow.org> 2026-01-08 21:41:29 +0200
commit a10cbd4e27d944464cec88aaf49d8b8c354d26e1 (patch)
tree 0bdd0a23fbb8939c15544b857c74101cb5721a6c /internal
parent f5cffe240c44045684d4f74981235b060828550e (diff)
Add Prometheus alert scraping with configurable timeout and host failover
Diffstat (limited to 'internal')
-rw-r--r-- internal/config.go 8
-rw-r--r-- internal/prometheus.go 131
-rw-r--r-- internal/prometheus_test.go 159
-rw-r--r-- internal/run.go 1
4 files changed, 298 insertions, 1 deletions
diff --git a/internal/config.go b/internal/config.go
index 2ade802..c129ab7 100644
--- a/internal/config.go
+++ b/internal/config.go
@@ -20,7 +20,9 @@ type config struct {
CheckConcurrency int
StaleThreshold int `json:"StaleThreshold,omitempty"`
Federated []string `json:"Federated,omitempty"` // TODO: Document this option
- Checks map[string]check
+ PrometheusHosts []string `json:"PrometheusHosts,omitempty"`
+ PrometheusTimeoutS int `json:"PrometheusTimeoutS,omitempty"`
+ Checks map[string]check
}
func newConfig(configFile string) (config, error) {
@@ -60,6 +62,10 @@ func newConfig(configFile string) (config, error) {
conf.StaleThreshold = 3600 // Default to 1 hour
}
+ if conf.PrometheusTimeoutS == 0 {
+ conf.PrometheusTimeoutS = 2 // Default to 2 seconds
+ }
+
if !conf.HTMLDisable && conf.HTMLStatusFile == "" {
conf.HTMLStatusFile = "/var/www/htdocs/buetow.org/self/gogios/index.html"
log.Println("Set HTMLStatusFile to " + conf.HTMLStatusFile)
diff --git a/internal/prometheus.go b/internal/prometheus.go
new file mode 100644
index 0000000..1c06aaf
--- /dev/null
+++ b/internal/prometheus.go
@@ -0,0 +1,131 @@
+package internal
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "io"
+ "log"
+ "net/http"
+ "time"
+)
+
+type prometheusResponse struct {
+ Status string `json:"status"`
+ Data struct {
+ Alerts []prometheusAlert `json:"alerts"`
+ } `json:"data"`
+}
+
+type prometheusAlert struct {
+ Labels map[string]string `json:"labels"`
+ Annotations map[string]string `json:"annotations"`
+ State string `json:"state"`
+}
+
+func mergePrometheusAlerts(ctx context.Context, state state, conf config) state {
+ if len(conf.PrometheusHosts) == 0 {
+ return state
+ }
+
+ timeout := time.Duration(conf.PrometheusTimeoutS) * time.Second
+ alerts, host, err := fetchPrometheusAlerts(ctx, conf.PrometheusHosts, timeout)
+ if err != nil {
+ log.Printf("Failed to fetch Prometheus alerts from any host: %v", err)
+ cs := checkResult{
+ name: "Prometheus alerts",
+ output: fmt.Sprintf("CRITICAL: %v", err),
+ epoch: time.Now().Unix(),
+ status: nagiosCritical,
+ }
+ state.update(cs)
+ return state
+ }
+
+ log.Printf("Fetched %d firing alerts from Prometheus host %s", len(alerts), host)
+
+ for _, alert := range alerts {
+ if alert.State != "firing" {
+ continue
+ }
+
+ name := fmt.Sprintf("Prometheus: %s", alert.Labels["alertname"])
+ severity := alert.Labels["severity"]
+ description := alert.Annotations["summary"]
+ if description == "" {
+ description = alert.Annotations["description"]
+ }
+ if description == "" {
+ description = "no description"
+ }
+
+ status := nagiosWarning
+ if severity == "critical" {
+ status = nagiosCritical
+ }
+
+ cs := checkResult{
+ name: name,
+ output: fmt.Sprintf("%s [%s]: %s", alert.Labels["alertname"], severity, description),
+ epoch: time.Now().Unix(),
+ status: status,
+ }
+ state.update(cs)
+ }
+
+ return state
+}
+
+func fetchPrometheusAlerts(ctx context.Context, hosts []string, timeout time.Duration) ([]prometheusAlert, string, error) {
+ var lastErr error
+
+ for _, host := range hosts {
+ alerts, err := fetchFromHost(ctx, host, timeout)
+ if err != nil {
+ log.Printf("Failed to fetch from Prometheus host %s: %v", host, err)
+ lastErr = err
+ continue
+ }
+ return alerts, host, nil
+ }
+
+ return nil, "", fmt.Errorf("all Prometheus hosts failed, last error: %w", lastErr)
+}
+
+func fetchFromHost(ctx context.Context, host string, timeout time.Duration) ([]prometheusAlert, error) {
+ url := fmt.Sprintf("http://%s/api/v1/alerts", host)
+
+ ctx, cancel := context.WithTimeout(ctx, timeout)
+ defer cancel()
+
+ req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+ if err != nil {
+ return nil, fmt.Errorf("failed to create request: %w", err)
+ }
+
+ resp, err := http.DefaultClient.Do(req)
+ if err != nil {
+ return nil, fmt.Errorf("request failed: %w", err)
+ }
+ defer func() { _ = resp.Body.Close() }()
+
+ if resp.StatusCode != http.StatusOK {
+ return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
+ }
+
+ body, err := io.ReadAll(resp.Body)
+ if err != nil {
+ return nil, fmt.Errorf("failed to read response: %w", err)
+ }
+
+ var promResp prometheusResponse
+ if err := json.Unmarshal(body, &promResp); err != nil {
+ return nil, fmt.Errorf("failed to parse response: %w", err)
+ }
+
+ if promResp.Status != "success" {
+ return nil, fmt.Errorf("prometheus returned status: %s", promResp.Status)
+ }
+
+ return promResp.Data.Alerts, nil
+}
diff --git a/internal/prometheus_test.go b/internal/prometheus_test.go
new file mode 100644
index 0000000..8d98e7f
--- /dev/null
+++ b/internal/prometheus_test.go
@@ -0,0 +1,159 @@
+package internal
+
+import (
+ "context"
+ "encoding/json"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "testing"
+ "time"
+)
+
+func TestFetchFromHost(t *testing.T) {
+ tests := []struct {
+ name string
+ response prometheusResponse
+ statusCode int
+ wantAlertCount int
+ wantErr bool
+ }{
+ {
+ name: "successful response with firing alerts",
+ response: prometheusResponse{
+ Status: "success",
+ Data: struct {
+ Alerts []prometheusAlert `json:"alerts"`
+ }{
+ Alerts: []prometheusAlert{
+ {
+ Labels: map[string]string{"alertname": "HighCPU", "severity": "critical"},
+ Annotations: map[string]string{"summary": "CPU usage is high"},
+ State: "firing",
+ },
+ {
+ Labels: map[string]string{"alertname": "DiskSpace", "severity": "warning"},
+ Annotations: map[string]string{"summary": "Disk space low"},
+ State: "firing",
+ },
+ },
+ },
+ },
+ statusCode: http.StatusOK,
+ wantAlertCount: 2,
+ wantErr: false,
+ },
+ {
+ name: "empty alerts",
+ response: prometheusResponse{
+ Status: "success",
+ Data: struct {
+ Alerts []prometheusAlert `json:"alerts"`
+ }{
+ Alerts: []prometheusAlert{},
+ },
+ },
+ statusCode: http.StatusOK,
+ wantAlertCount: 0,
+ wantErr: false,
+ },
+ {
+ name: "server error",
+ response: prometheusResponse{},
+ statusCode: http.StatusInternalServerError,
+ wantAlertCount: 0,
+ wantErr: true,
+ },
+ {
+ name: "prometheus error status",
+ response: prometheusResponse{
+ Status: "error",
+ },
+ statusCode: http.StatusOK,
+ wantAlertCount: 0,
+ wantErr: true,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ if r.URL.Path != "/api/v1/alerts" {
+ t.Errorf("unexpected path: %s", r.URL.Path)
+ }
+ w.WriteHeader(tt.statusCode)
+ _ = json.NewEncoder(w).Encode(tt.response)
+ }))
+ defer server.Close()
+
+ host := strings.TrimPrefix(server.URL, "http://")
+ alerts, err := fetchFromHost(context.Background(), host, 2*time.Second)
+
+ if tt.wantErr && err == nil {
+ t.Error("expected error, got nil")
+ }
+ if !tt.wantErr && err != nil {
+ t.Errorf("unexpected error: %v", err)
+ }
+ if len(alerts) != tt.wantAlertCount {
+ t.Errorf("got %d alerts, want %d", len(alerts), tt.wantAlertCount)
+ }
+ })
+ }
+}
+
+func TestFetchPrometheusAlertsFailover(t *testing.T) {
+ callCount := 0
+
+ server1 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ callCount++
+ time.Sleep(3 * time.Second) // exceed timeout
+ }))
+ defer server1.Close()
+
+ server2 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ callCount++
+ resp := prometheusResponse{
+ Status: "success",
+ Data: struct {
+ Alerts []prometheusAlert `json:"alerts"`
+ }{
+ Alerts: []prometheusAlert{
+ {
+ Labels: map[string]string{"alertname": "Test"},
+ State: "firing",
+ },
+ },
+ },
+ }
+ _ = json.NewEncoder(w).Encode(resp)
+ }))
+ defer server2.Close()
+
+ hosts := []string{
+ strings.TrimPrefix(server1.URL, "http://"),
+ strings.TrimPrefix(server2.URL, "http://"),
+ }
+
+ alerts, host, err := fetchPrometheusAlerts(context.Background(), hosts, 2*time.Second)
+ if err != nil {
+ t.Errorf("unexpected error: %v", err)
+ }
+ if len(alerts) != 1 {
+ t.Errorf("got %d alerts, want 1", len(alerts))
+ }
+ if host != hosts[1] {
+ t.Errorf("got host %s, want %s", host, hosts[1])
+ }
+}
+
+func TestMergePrometheusAlertsNoHosts(t *testing.T) {
+ conf := config{PrometheusHosts: nil}
+ s := state{checks: make(map[string]checkState)}
+
+ result := mergePrometheusAlerts(context.Background(), s, conf)
+
+ if len(result.checks) != 0 {
+ t.Errorf("expected no checks, got %d", len(result.checks))
+ }
+}
diff --git a/internal/run.go b/internal/run.go
index eed8ad5..f45f937 100644
--- a/internal/run.go
+++ b/internal/run.go
@@ -23,6 +23,7 @@ func Run(ctx context.Context, configFile string, renotify, force bool) {
}
state = runChecks(ctx, state, conf)
+ state = mergePrometheusAlerts(ctx, state, conf)
state = mergeFederated(ctx, state, conf)
if err := state.persist(); err != nil {