diff options
Diffstat (limited to 'internal')
| -rw-r--r-- | internal/check.go | 16 | ||||
| -rw-r--r-- | internal/config.go | 12 | ||||
| -rw-r--r-- | internal/html.go | 71 | ||||
| -rw-r--r-- | internal/html_test.go | 5 | ||||
| -rw-r--r-- | internal/run.go | 2 | ||||
| -rw-r--r-- | internal/state.go | 83 | ||||
| -rw-r--r-- | internal/suppress.go | 59 | ||||
| -rw-r--r-- | internal/suppress_test.go | 163 |
8 files changed, 370 insertions, 41 deletions
diff --git a/internal/check.go b/internal/check.go index a171c29..aabb3ec 100644 --- a/internal/check.go +++ b/internal/check.go @@ -9,13 +9,15 @@ import ( ) type check struct { - Plugin string - Args []string - DependsOn []string `json:"DependsOn,omitempty"` - Retries int `json:"Retries,omitempty"` - RetryInterval int `json:"RetryInterval,omitempty"` - RunInterval int `json:"RunInterval,omitempty"` - RandomSpread int `json:"RandomSpread,omitempty"` + Plugin string + Args []string + DependsOn []string `json:"DependsOn,omitempty"` + Retries int `json:"Retries,omitempty"` + RetryInterval int `json:"RetryInterval,omitempty"` + RunInterval int `json:"RunInterval,omitempty"` + RandomSpread int `json:"RandomSpread,omitempty"` + OnlyIfNotExists string `json:"OnlyIfNotExists,omitempty"` // Suppress alerts if this file exists and is recent + OnlyIfNotExistsMaxS int `json:"OnlyIfNotExistsMaxS,omitempty"` // Max age in seconds for suppression file (uses global default if 0) } type namedCheck struct { diff --git a/internal/config.go b/internal/config.go index 5d172d6..c3a0d5f 100644 --- a/internal/config.go +++ b/internal/config.go @@ -21,9 +21,11 @@ type config struct { CheckConcurrency int StaleThreshold int `json:"StaleThreshold,omitempty"` Federated []string `json:"Federated,omitempty"` // TODO: Document this option - PrometheusHosts []string `json:"PrometheusHosts,omitempty"` - PrometheusTimeoutS int `json:"PrometheusTimeoutS,omitempty"` - Checks map[string]check + PrometheusHosts []string `json:"PrometheusHosts,omitempty"` + PrometheusTimeoutS int `json:"PrometheusTimeoutS,omitempty"` + PrometheusOnlyIfNotExists string `json:"PrometheusOnlyIfNotExists,omitempty"` // Suppress Prometheus alerts if this file exists and is recent + PrometheusOnlyIfNotExistsMaxS int `json:"PrometheusOnlyIfNotExistsMaxS,omitempty"` // Max age in seconds for suppression file (default 86400) + Checks map[string]check } func newConfig(configFile string) (config, error) { @@ -67,6 +69,10 @@ func newConfig(configFile string) (config, error) { conf.PrometheusTimeoutS = 2 // Default to 2 seconds } + if conf.PrometheusOnlyIfNotExistsMaxS == 0 { + conf.PrometheusOnlyIfNotExistsMaxS = 86400 // Default to 24 hours + } + if !conf.HTMLDisable && conf.HTMLStatusFile == "" { conf.HTMLStatusFile = "/var/www/htdocs/buetow.org/self/gogios/index.html" log.Println("Set HTMLStatusFile to " + conf.HTMLStatusFile) diff --git a/internal/html.go b/internal/html.go index 1ccb894..facb575 100644 --- a/internal/html.go +++ b/internal/html.go @@ -40,7 +40,7 @@ func persistHTMLReport(state state, subject string, conf config) error { } defer f.Close() - htmlContent := state.htmlReport(subject) + htmlContent := state.htmlReport(subject, conf) if _, err = f.WriteString(htmlContent); err != nil { log.Println("debug: error writing HTML:", err) return fmt.Errorf("failed to write HTML: %w", err) @@ -58,23 +58,30 @@ func persistHTMLReport(state state, subject string, conf config) error { // htmlReport generates the complete HTML status page. // Mirrors state.report() pattern from state.go:133-163. -func (s state) htmlReport(subject string) string { +// Note: HTML report shows full state without suppression in main sections for visibility, +// but includes a dedicated "Suppressed alerts" section showing which checks are muted. +func (s state) htmlReport(subject string, conf config) string { var sb strings.Builder + // Use empty config for main sections so no checks are suppressed. + // The HTML status page shows full state for visibility. + emptyConf := config{} + // Calculate counts for header summary (without generating HTML yet) - numCriticals := s.countBy(func(cs checkState) bool { + numCriticals := s.countBy(emptyConf, func(cs checkState) bool { return cs.Status == nagiosCritical }) - numWarnings := s.countBy(func(cs checkState) bool { + numWarnings := s.countBy(emptyConf, func(cs checkState) bool { return cs.Status == nagiosWarning }) - numUnknown := s.countBy(func(cs checkState) bool { + numUnknown := s.countBy(emptyConf, func(cs checkState) bool { return cs.Status == nagiosUnknown }) - numOK := s.countBy(func(cs checkState) bool { + numOK := s.countBy(emptyConf, func(cs checkState) bool { return cs.Status == nagiosOk }) - numStale := s.countStale() + numStale := s.countStale(emptyConf) + numSuppressed := s.countSuppressed(conf) // Write HTML header with summary sb.WriteString(htmlHeader(subject, numCriticals, numWarnings, numUnknown, numStale, numOK)) @@ -109,6 +116,16 @@ func (s state) htmlReport(subject string) string { } sb.WriteString(`</div>` + "\n\n") + // Suppressed alerts section + sb.WriteString(`<div class="section">` + "\n") + sb.WriteString(`<h2>Suppressed alerts</h2>` + "\n") + if numSuppressed == 0 { + sb.WriteString(`<p>There are no suppressed alerts...</p>` + "\n") + } else { + s.htmlReportSuppressed(&sb, conf) + } + sb.WriteString(`</div>` + "\n\n") + // OK checks section sb.WriteString(`<div class="section">` + "\n") sb.WriteString(`<h2>OK checks</h2>` + "\n") @@ -181,6 +198,42 @@ func (s state) htmlReportStaleAlerts(sb *strings.Builder) int { }) } +// htmlReportSuppressed generates HTML for suppressed checks. +// Shows which checks are currently muted via OnlyIfNotExists for visibility. +func (s state) htmlReportSuppressed(sb *strings.Builder, conf config) (count int) { + for name, cs := range s.checks { + if !isCheckSuppressed(name, conf) { + continue + } + count++ + + sb.WriteString(`<div class="check-item">` + "\n") + sb.WriteString(htmlStatusBadge(nagiosCode(cs.Status))) + sb.WriteString(": ") + sb.WriteString(html.EscapeString(name)) + sb.WriteString(": ") + sb.WriteString(html.EscapeString(cs.Output)) + if cs.federated() { + sb.WriteString(" [federated from ") + sb.WriteString(html.EscapeString(cs.FederatedFrom)) + sb.WriteString("]") + } + sb.WriteString(` <span class="UNKNOWN">[SUPPRESSED]</span>`) + sb.WriteString("\n</div>\n") + } + return +} + +// countSuppressed counts the number of suppressed checks. +func (s state) countSuppressed(conf config) (count int) { + for name := range s.checks { + if isCheckSuppressed(name, conf) { + count++ + } + } + return +} + // htmlReportBy is the generic HTML generator for check items. // Mirrors state.reportBy() from state.go:222-262 but outputs HTML. func (s state) htmlReportBy(sb *strings.Builder, showStatusChange, isStaleReport bool, @@ -231,8 +284,8 @@ func (s state) htmlReportBy(sb *strings.Builder, showStatusChange, isStaleReport // countStale counts the number of stale checks (excluding OK status). // Helper function for generating summary counts. -func (s state) countStale() int { - return s.countBy(func(cs checkState) bool { +func (s state) countStale(conf config) int { + return s.countBy(conf, func(cs checkState) bool { return cs.Epoch < s.staleEpoch && cs.Status != nagiosOk }) } diff --git a/internal/html_test.go b/internal/html_test.go index d482eda..b77c937 100644 --- a/internal/html_test.go +++ b/internal/html_test.go @@ -219,7 +219,7 @@ func TestHtmlReport(t *testing.T) { } subject := "GOGIOS Report [C:1 W:1 U:0 S:1 OK:2]" - result := s.htmlReport(subject) + result := s.htmlReport(subject, config{}) // Check that all major sections are present expectedSections := []string{ @@ -228,6 +228,7 @@ func TestHtmlReport(t *testing.T) { "Alerts with status changed", "Unhandled alerts", "Stale alerts", + "Suppressed alerts", "Generated by Gogios", "</html>", } @@ -399,7 +400,7 @@ func TestW3CCompliance(t *testing.T) { } subject := "GOGIOS Report [C:1 W:0 U:0 S:0 OK:0]" - html := s.htmlReport(subject) + html := s.htmlReport(subject, config{}) // W3C HTML5 Required Elements requiredElements := map[string]string{ diff --git a/internal/run.go b/internal/run.go index 348bdd9..9d1b21c 100644 --- a/internal/run.go +++ b/internal/run.go @@ -30,7 +30,7 @@ func Run(ctx context.Context, configFile string, renotify, force bool) { notifyError(conf, err) } - subject, body, doNotify := state.report(renotify, force, conf.StatusPageURL) + subject, body, doNotify := state.report(renotify, force, conf.StatusPageURL, conf) if doNotify { if err := notify(conf, subject, body); err != nil { log.Println("error:", err) diff --git a/internal/state.go b/internal/state.go index cb2c665..4df7757 100644 --- a/internal/state.go +++ b/internal/state.go @@ -132,30 +132,37 @@ func (s state) persist() error { // report generates the notification email content. // statusPageURL is included as a link to the HTML status page. -func (s state) report(renotify, force bool, statusPageURL string) (string, string, bool) { +// conf is used to determine which checks should be suppressed from the report. +func (s state) report(renotify, force bool, statusPageURL string, conf config) (string, string, bool) { var sb strings.Builder sb.WriteString("This is the recent Gogios report!\n\n") sb.WriteString("# Alerts with status changed:\n\n") - changed := s.reportChanged(&sb) + changed := s.reportChanged(&sb, conf) if !changed { sb.WriteString("There were no status changes...\n\n") } sb.WriteString("# Unhandled alerts:\n\n") - numCriticals, numWarnings, numUnknown, numOK := s.reportUnhandled(&sb) + numCriticals, numWarnings, numUnknown, numOK := s.reportUnhandled(&sb, conf) hasUnhandled := (numCriticals + numWarnings + numUnknown) > 0 if !hasUnhandled { sb.WriteString("There are no unhandled alerts...\n\n") } sb.WriteString("# Stale alerts:\n\n") - numStale := s.reportStaleAlerts(&sb) + numStale := s.reportStaleAlerts(&sb, conf) if numStale == 0 { sb.WriteString("There are no stale alerts...\n\n") } + sb.WriteString("# Suppressed alerts:\n\n") + numSuppressed := s.reportSuppressed(&sb, conf) + if numSuppressed == 0 { + sb.WriteString("There are no suppressed alerts...\n\n") + } + sb.WriteString("# Status page:\n\n") sb.WriteString(statusPageURL) sb.WriteString("\n\n") @@ -169,26 +176,26 @@ func (s state) report(renotify, force bool, statusPageURL string) (string, strin return subject, sb.String(), doNotify } -func (s state) reportChanged(sb *strings.Builder) (changed bool) { - if 0 < s.reportBy(sb, true, false, func(cs checkState) bool { +func (s state) reportChanged(sb *strings.Builder, conf config) (changed bool) { + if 0 < s.reportBy(sb, true, false, conf, func(cs checkState) bool { return cs.Status == nagiosCritical && cs.changed() }) { changed = true } - if 0 < s.reportBy(sb, true, false, func(cs checkState) bool { + if 0 < s.reportBy(sb, true, false, conf, func(cs checkState) bool { return cs.Status == nagiosWarning && cs.changed() }) { changed = true } - if 0 < s.reportBy(sb, true, false, func(cs checkState) bool { + if 0 < s.reportBy(sb, true, false, conf, func(cs checkState) bool { return cs.Status == nagiosUnknown && cs.changed() }) { changed = true } - if 0 < s.reportBy(sb, true, false, func(cs checkState) bool { + if 0 < s.reportBy(sb, true, false, conf, func(cs checkState) bool { return cs.Status == nagiosOk && cs.changed() }) { changed = true @@ -197,37 +204,68 @@ func (s state) reportChanged(sb *strings.Builder) (changed bool) { return } -func (s state) reportUnhandled(sb *strings.Builder) (numCriticals, numWarnings, +func (s state) reportUnhandled(sb *strings.Builder, conf config) (numCriticals, numWarnings, numUnknown, numOK int, ) { - numCriticals = s.reportBy(sb, false, false, func(cs checkState) bool { + numCriticals = s.reportBy(sb, false, false, conf, func(cs checkState) bool { return cs.Status == nagiosCritical }) - numWarnings = s.reportBy(sb, false, false, func(cs checkState) bool { + numWarnings = s.reportBy(sb, false, false, conf, func(cs checkState) bool { return cs.Status == nagiosWarning }) - numUnknown = s.reportBy(sb, false, false, func(cs checkState) bool { + numUnknown = s.reportBy(sb, false, false, conf, func(cs checkState) bool { return cs.Status == nagiosUnknown }) - numOK = s.countBy(func(cs checkState) bool { + numOK = s.countBy(conf, func(cs checkState) bool { return cs.Status == nagiosOk }) return } -func (s state) reportStaleAlerts(sb *strings.Builder) int { +func (s state) reportStaleAlerts(sb *strings.Builder, conf config) int { // Only report stale alerts that are not OK, since stale OK alerts aren't concerning - return s.reportBy(sb, false, true, func(cs checkState) bool { + return s.reportBy(sb, false, true, conf, func(cs checkState) bool { return cs.Epoch < s.staleEpoch && cs.Status != nagiosOk }) } +// reportSuppressed lists all checks that are currently suppressed via OnlyIfNotExists. +// This provides visibility into which alerts are being muted during maintenance windows. +func (s state) reportSuppressed(sb *strings.Builder, conf config) (count int) { + for name, cs := range s.checks { + if !isCheckSuppressed(name, conf) { + continue + } + count++ + + sb.WriteString(nagiosCode(cs.Status).Str()) + sb.WriteString(": ") + sb.WriteString(name) + sb.WriteString(": ") + sb.WriteString(cs.Output) + if cs.federated() { + sb.WriteString(" [federated from ") + sb.WriteString(cs.FederatedFrom) + sb.WriteString("]") + } + sb.WriteString(" [SUPPRESSED]") + sb.WriteString("\n") + } + + if count > 0 { + sb.WriteString("\n") + } + return +} + +// reportBy iterates over checks matching the filter and writes them to sb. +// Checks that are suppressed via OnlyIfNotExists are excluded from the report. func (s state) reportBy(sb *strings.Builder, showStatusChange, isStaleReport bool, - filter func(cs checkState) bool, + conf config, filter func(cs checkState) bool, ) (count int) { for name, cs := range s.checks { if !filter(cs) { @@ -236,6 +274,9 @@ func (s state) reportBy(sb *strings.Builder, showStatusChange, isStaleReport boo if !isStaleReport && cs.Epoch < s.staleEpoch { continue // skip stale checks in non-stale report } + if isCheckSuppressed(name, conf) { + continue // skip suppressed checks + } count++ if showStatusChange && cs.changed() { @@ -268,8 +309,12 @@ func (s state) reportBy(sb *strings.Builder, showStatusChange, isStaleReport boo return } -func (s state) countBy(filter func(cs checkState) bool) (count int) { - for _, cs := range s.checks { +// countBy counts checks matching the filter, excluding suppressed checks. +func (s state) countBy(conf config, filter func(cs checkState) bool) (count int) { + for name, cs := range s.checks { + if isCheckSuppressed(name, conf) { + continue // skip suppressed checks + } if filter(cs) { count++ } diff --git a/internal/suppress.go b/internal/suppress.go new file mode 100644 index 0000000..a270656 --- /dev/null +++ b/internal/suppress.go @@ -0,0 +1,59 @@ +package internal + +import ( + "log" + "os" + "strings" + "time" +) + +// isSuppressed checks if alerts should be suppressed based on a file's existence and age. +// Returns true if the file exists AND its modification time is within maxAgeS seconds of now. +// Returns false if filePath is empty, file doesn't exist, or file is too old. +func isSuppressed(filePath string, maxAgeS int) bool { + if filePath == "" { + return false + } + info, err := os.Stat(filePath) + if err != nil { + return false // file doesn't exist or other error + } + age := time.Since(info.ModTime()) + return age <= time.Duration(maxAgeS)*time.Second +} + +// isCheckSuppressed determines if a check should be suppressed from email reports. +// For Prometheus checks (name starts with "Prometheus"): uses PrometheusOnlyIfNotExists config. +// For regular checks: uses per-check OnlyIfNotExists config if set. +func isCheckSuppressed(name string, conf config) bool { + // Check if this is a Prometheus alert (name starts with "Prometheus") + if strings.HasPrefix(name, "Prometheus") { + if isSuppressed(conf.PrometheusOnlyIfNotExists, conf.PrometheusOnlyIfNotExistsMaxS) { + log.Printf("Suppressing %s: file %s exists and is recent", name, conf.PrometheusOnlyIfNotExists) + return true + } + return false + } + + // For regular checks, look up the check config + chk, ok := conf.Checks[name] + if !ok { + return false // check not found in config (e.g., federated) + } + + if chk.OnlyIfNotExists == "" { + return false + } + + // Use per-check max age if set, otherwise use global Prometheus default + maxAgeS := chk.OnlyIfNotExistsMaxS + if maxAgeS == 0 { + maxAgeS = conf.PrometheusOnlyIfNotExistsMaxS + } + + if isSuppressed(chk.OnlyIfNotExists, maxAgeS) { + log.Printf("Suppressing %s: file %s exists and is recent", name, chk.OnlyIfNotExists) + return true + } + return false +} diff --git a/internal/suppress_test.go b/internal/suppress_test.go new file mode 100644 index 0000000..2a399d7 --- /dev/null +++ b/internal/suppress_test.go @@ -0,0 +1,163 @@ +package internal + +import ( + "os" + "testing" + "time" +) + +func TestIsSuppressed_EmptyPath(t *testing.T) { + // Empty file path should not suppress + if isSuppressed("", 86400) { + t.Error("Expected empty path to not suppress") + } +} + +func TestIsSuppressed_NonExistentFile(t *testing.T) { + // Non-existent file should not suppress + if isSuppressed("/nonexistent/path/to/file", 86400) { + t.Error("Expected non-existent file to not suppress") + } +} + +func TestIsSuppressed_RecentFile(t *testing.T) { + // Create a temporary file + tmpFile, err := os.CreateTemp("", "suppress_test") + if err != nil { + t.Fatalf("Failed to create temp file: %v", err) + } + defer os.Remove(tmpFile.Name()) + tmpFile.Close() + + // Recent file should suppress + if !isSuppressed(tmpFile.Name(), 86400) { + t.Error("Expected recent file to suppress") + } +} + +func TestIsSuppressed_OldFile(t *testing.T) { + // Create a temporary file + tmpFile, err := os.CreateTemp("", "suppress_test") + if err != nil { + t.Fatalf("Failed to create temp file: %v", err) + } + defer os.Remove(tmpFile.Name()) + tmpFile.Close() + + // Set the file's modification time to 2 hours ago + oldTime := time.Now().Add(-2 * time.Hour) + if err := os.Chtimes(tmpFile.Name(), oldTime, oldTime); err != nil { + t.Fatalf("Failed to change file time: %v", err) + } + + // File older than maxAgeS (1 hour = 3600s) should not suppress + if isSuppressed(tmpFile.Name(), 3600) { + t.Error("Expected old file to not suppress") + } + + // File within maxAgeS (3 hours = 10800s) should suppress + if !isSuppressed(tmpFile.Name(), 10800) { + t.Error("Expected file within max age to suppress") + } +} + +func TestIsCheckSuppressed_PrometheusCheck(t *testing.T) { + // Create a temporary file for Prometheus suppression + tmpFile, err := os.CreateTemp("", "prometheus_suppress_test") + if err != nil { + t.Fatalf("Failed to create temp file: %v", err) + } + defer os.Remove(tmpFile.Name()) + tmpFile.Close() + + conf := config{ + PrometheusOnlyIfNotExists: tmpFile.Name(), + PrometheusOnlyIfNotExistsMaxS: 86400, + Checks: make(map[string]check), + } + + // Prometheus check should be suppressed when file exists + if !isCheckSuppressed("Prometheus: TestAlert", conf) { + t.Error("Expected Prometheus check to be suppressed") + } + + // Non-Prometheus check should not be affected by Prometheus suppression + conf.Checks["Regular Check"] = check{} + if isCheckSuppressed("Regular Check", conf) { + t.Error("Expected regular check to not be suppressed by Prometheus config") + } +} + +func TestIsCheckSuppressed_RegularCheck(t *testing.T) { + // Create a temporary file for check suppression + tmpFile, err := os.CreateTemp("", "check_suppress_test") + if err != nil { + t.Fatalf("Failed to create temp file: %v", err) + } + defer os.Remove(tmpFile.Name()) + tmpFile.Close() + + conf := config{ + PrometheusOnlyIfNotExistsMaxS: 86400, + Checks: map[string]check{ + "Suppressed Check": { + OnlyIfNotExists: tmpFile.Name(), + OnlyIfNotExistsMaxS: 86400, + }, + "Normal Check": {}, + }, + } + + // Check with suppression file should be suppressed + if !isCheckSuppressed("Suppressed Check", conf) { + t.Error("Expected check with suppression file to be suppressed") + } + + // Check without suppression file should not be suppressed + if isCheckSuppressed("Normal Check", conf) { + t.Error("Expected check without suppression file to not be suppressed") + } + + // Unknown check (not in config) should not be suppressed + if isCheckSuppressed("Unknown Check", conf) { + t.Error("Expected unknown check to not be suppressed") + } +} + +func TestIsCheckSuppressed_UsesGlobalDefaultMaxAge(t *testing.T) { + // Create a temporary file + tmpFile, err := os.CreateTemp("", "suppress_test") + if err != nil { + t.Fatalf("Failed to create temp file: %v", err) + } + defer os.Remove(tmpFile.Name()) + tmpFile.Close() + + // Set the file's modification time to 2 hours ago + oldTime := time.Now().Add(-2 * time.Hour) + if err := os.Chtimes(tmpFile.Name(), oldTime, oldTime); err != nil { + t.Fatalf("Failed to change file time: %v", err) + } + + // Config with short global max age (1 hour) + conf := config{ + PrometheusOnlyIfNotExistsMaxS: 3600, + Checks: map[string]check{ + "Test Check": { + OnlyIfNotExists: tmpFile.Name(), + OnlyIfNotExistsMaxS: 0, // Use global default + }, + }, + } + + // Should NOT be suppressed because file is older than global default (1 hour) + if isCheckSuppressed("Test Check", conf) { + t.Error("Expected check to not be suppressed when file is older than global max age") + } + + // Config with longer global max age (3 hours) + conf.PrometheusOnlyIfNotExistsMaxS = 10800 + if !isCheckSuppressed("Test Check", conf) { + t.Error("Expected check to be suppressed when file is within global max age") + } +} |
