diff options
| author | Paul Buetow <paul@buetow.org> | 2026-01-08 21:41:29 +0200 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-01-08 21:41:29 +0200 |
| commit | a10cbd4e27d944464cec88aaf49d8b8c354d26e1 (patch) | |
| tree | 0bdd0a23fbb8939c15544b857c74101cb5721a6c /internal | |
| parent | f5cffe240c44045684d4f74981235b060828550e (diff) | |
Add Prometheus alert scraping with configurable timeout and host failover
Diffstat (limited to 'internal')
| -rw-r--r-- | internal/config.go | 8 | ||||
| -rw-r--r-- | internal/prometheus.go | 131 | ||||
| -rw-r--r-- | internal/prometheus_test.go | 159 | ||||
| -rw-r--r-- | internal/run.go | 1 |
4 files changed, 298 insertions, 1 deletion
diff --git a/internal/config.go b/internal/config.go index 2ade802..c129ab7 100644 --- a/internal/config.go +++ b/internal/config.go @@ -20,7 +20,9 @@ type config struct { CheckConcurrency int StaleThreshold int `json:"StaleThreshold,omitempty"` Federated []string `json:"Federated,omitempty"` // TODO: Document this option - Checks map[string]check + PrometheusHosts []string `json:"PrometheusHosts,omitempty"` + PrometheusTimeoutS int `json:"PrometheusTimeoutS,omitempty"` + Checks map[string]check } func newConfig(configFile string) (config, error) { @@ -60,6 +62,10 @@ func newConfig(configFile string) (config, error) { conf.StaleThreshold = 3600 // Default to 1 hour } + if conf.PrometheusTimeoutS == 0 { + conf.PrometheusTimeoutS = 2 // Default to 2 seconds + } + if !conf.HTMLDisable && conf.HTMLStatusFile == "" { conf.HTMLStatusFile = "/var/www/htdocs/buetow.org/self/gogios/index.html" log.Println("Set HTMLStatusFile to " + conf.HTMLStatusFile) diff --git a/internal/prometheus.go b/internal/prometheus.go new file mode 100644 index 0000000..1c06aaf --- /dev/null +++ b/internal/prometheus.go @@ -0,0 +1,131 @@ +package internal + +import ( + "context" + "encoding/json" + "fmt" + "io" + "log" + "net/http" + "time" +) + +type prometheusResponse struct { + Status string `json:"status"` + Data struct { + Alerts []prometheusAlert `json:"alerts"` + } `json:"data"` +} + +type prometheusAlert struct { + Labels map[string]string `json:"labels"` + Annotations map[string]string `json:"annotations"` + State string `json:"state"` +} + +func mergePrometheusAlerts(ctx context.Context, state state, conf config) state { + if len(conf.PrometheusHosts) == 0 { + return state + } + + timeout := time.Duration(conf.PrometheusTimeoutS) * time.Second + alerts, host, err := fetchPrometheusAlerts(ctx, conf.PrometheusHosts, timeout) + if err != nil { + log.Printf("Failed to fetch Prometheus alerts from any host: %v", err) + cs := checkResult{ + name: "Prometheus alerts", + output: 
fmt.Sprintf("CRITICAL: %v", err), + epoch: time.Now().Unix(), + status: nagiosCritical, + } + state.update(cs) + return state + } + + log.Printf("Fetched %d firing alerts from Prometheus host %s", len(alerts), host) + + for _, alert := range alerts { + if alert.State != "firing" { + continue + } + + name := fmt.Sprintf("Prometheus: %s", alert.Labels["alertname"]) + severity := alert.Labels["severity"] + description := alert.Annotations["summary"] + if description == "" { + description = alert.Annotations["description"] + } + if description == "" { + description = "no description" + } + + status := nagiosWarning + if severity == "critical" { + status = nagiosCritical + } + + cs := checkResult{ + name: name, + output: fmt.Sprintf("%s [%s]: %s", alert.Labels["alertname"], severity, description), + epoch: time.Now().Unix(), + status: status, + } + state.update(cs) + } + + return state +} + +func fetchPrometheusAlerts(ctx context.Context, hosts []string, timeout time.Duration) ([]prometheusAlert, string, error) { + var lastErr error + + for _, host := range hosts { + alerts, err := fetchFromHost(ctx, host, timeout) + if err != nil { + log.Printf("Failed to fetch from Prometheus host %s: %v", host, err) + lastErr = err + continue + } + return alerts, host, nil + } + + return nil, "", fmt.Errorf("all Prometheus hosts failed, last error: %w", lastErr) +} + +func fetchFromHost(ctx context.Context, host string, timeout time.Duration) ([]prometheusAlert, error) { + url := fmt.Sprintf("http://%s/api/v1/alerts", host) + + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return nil, fmt.Errorf("request failed: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("unexpected 
status code: %d", resp.StatusCode) + } + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read response: %w", err) + } + + var promResp prometheusResponse + if err := json.Unmarshal(body, &promResp); err != nil { + return nil, fmt.Errorf("failed to parse response: %w", err) + } + + if promResp.Status != "success" { + return nil, fmt.Errorf("prometheus returned status: %s", promResp.Status) + } + + return promResp.Data.Alerts, nil +} diff --git a/internal/prometheus_test.go b/internal/prometheus_test.go new file mode 100644 index 0000000..8d98e7f --- /dev/null +++ b/internal/prometheus_test.go @@ -0,0 +1,159 @@ +package internal + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" +) + +func TestFetchFromHost(t *testing.T) { + tests := []struct { + name string + response prometheusResponse + statusCode int + wantAlertCount int + wantErr bool + }{ + { + name: "successful response with firing alerts", + response: prometheusResponse{ + Status: "success", + Data: struct { + Alerts []prometheusAlert `json:"alerts"` + }{ + Alerts: []prometheusAlert{ + { + Labels: map[string]string{"alertname": "HighCPU", "severity": "critical"}, + Annotations: map[string]string{"summary": "CPU usage is high"}, + State: "firing", + }, + { + Labels: map[string]string{"alertname": "DiskSpace", "severity": "warning"}, + Annotations: map[string]string{"summary": "Disk space low"}, + State: "firing", + }, + }, + }, + }, + statusCode: http.StatusOK, + wantAlertCount: 2, + wantErr: false, + }, + { + name: "empty alerts", + response: prometheusResponse{ + Status: "success", + Data: struct { + Alerts []prometheusAlert `json:"alerts"` + }{ + Alerts: []prometheusAlert{}, + }, + }, + statusCode: http.StatusOK, + wantAlertCount: 0, + wantErr: false, + }, + { + name: "server error", + response: prometheusResponse{}, + statusCode: http.StatusInternalServerError, + wantAlertCount: 0, + wantErr: true, + }, + 
{ + name: "prometheus error status", + response: prometheusResponse{ + Status: "error", + }, + statusCode: http.StatusOK, + wantAlertCount: 0, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/api/v1/alerts" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + w.WriteHeader(tt.statusCode) + _ = json.NewEncoder(w).Encode(tt.response) + })) + defer server.Close() + + host := strings.TrimPrefix(server.URL, "http://") + alerts, err := fetchFromHost(context.Background(), host, 2*time.Second) + + if tt.wantErr && err == nil { + t.Error("expected error, got nil") + } + if !tt.wantErr && err != nil { + t.Errorf("unexpected error: %v", err) + } + if len(alerts) != tt.wantAlertCount { + t.Errorf("got %d alerts, want %d", len(alerts), tt.wantAlertCount) + } + }) + } +} + +func TestFetchPrometheusAlertsFailover(t *testing.T) { + callCount := 0 + + server1 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + callCount++ + time.Sleep(3 * time.Second) // exceed timeout + })) + defer server1.Close() + + server2 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + callCount++ + resp := prometheusResponse{ + Status: "success", + Data: struct { + Alerts []prometheusAlert `json:"alerts"` + }{ + Alerts: []prometheusAlert{ + { + Labels: map[string]string{"alertname": "Test"}, + State: "firing", + }, + }, + }, + } + _ = json.NewEncoder(w).Encode(resp) + })) + defer server2.Close() + + hosts := []string{ + strings.TrimPrefix(server1.URL, "http://"), + strings.TrimPrefix(server2.URL, "http://"), + } + + alerts, host, err := fetchPrometheusAlerts(context.Background(), hosts, 2*time.Second) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if len(alerts) != 1 { + t.Errorf("got %d alerts, want 1", len(alerts)) + } + if host != hosts[1] { + 
t.Errorf("got host %s, want %s", host, hosts[1]) + } +} + +func TestMergePrometheusAlertsNoHosts(t *testing.T) { + conf := config{PrometheusHosts: nil} + s := state{checks: make(map[string]checkState)} + + result := mergePrometheusAlerts(context.Background(), s, conf) + + if len(result.checks) != 0 { + t.Errorf("expected no checks, got %d", len(result.checks)) + } +} diff --git a/internal/run.go b/internal/run.go index eed8ad5..f45f937 100644 --- a/internal/run.go +++ b/internal/run.go @@ -23,6 +23,7 @@ func Run(ctx context.Context, configFile string, renotify, force bool) { } state = runChecks(ctx, state, conf) + state = mergePrometheusAlerts(ctx, state, conf) state = mergeFederated(ctx, state, conf) if err := state.persist(); err != nil { |
