diff options
| author | Paul Buetow <paul@buetow.org> | 2026-01-27 18:16:04 +0200 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-01-27 18:16:04 +0200 |
| commit | 115f2b371bef591a114a072794116b160fc15907 (patch) | |
| tree | 8dbf5a3c710f3f1ba2e0d1d8b56e341011ef1f56 | |
| parent | 4dcd85d27d77439daca8223acf49e2166e0c0996 (diff) | |
feat: add minimum notification interval for email batching
Add MinNotifyIntervalS config option to batch email notifications over a
time interval. When configured, Gogios only sends an email when:
1. The interval has elapsed since the last notification, AND
2. There's been a state change since the last notification.
HTML status page and text reports continue updating on every run.
The --force flag bypasses the interval for immediate notifications.
Notification state (timestamp + check states snapshot) is persisted to
{StateDir}/notify_state.json for comparison across runs.
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
| -rw-r--r-- | internal/config.go | 6 | ||||
| -rw-r--r-- | internal/notify_state.go | 114 | ||||
| -rw-r--r-- | internal/notify_state_test.go | 225 | ||||
| -rw-r--r-- | internal/run.go | 38 |
4 files changed, 380 insertions, 3 deletions
diff --git a/internal/config.go b/internal/config.go index c3a0d5f..ffc2353 100644 --- a/internal/config.go +++ b/internal/config.go @@ -21,6 +21,12 @@ type config struct { CheckConcurrency int StaleThreshold int `json:"StaleThreshold,omitempty"` Federated []string `json:"Federated,omitempty"` // TODO: Document this option + // MinNotifyIntervalS is the minimum interval in seconds between email notifications. + // When set > 0, Gogios batches notifications and only sends an email when: + // 1. The interval has elapsed since the last notification, AND + // 2. There's been a state change since the last notification. + // Set to 0 (default) for immediate notifications on every state change. + MinNotifyIntervalS int `json:"MinNotifyIntervalS,omitempty"` PrometheusHosts []string `json:"PrometheusHosts,omitempty"` PrometheusTimeoutS int `json:"PrometheusTimeoutS,omitempty"` PrometheusOnlyIfNotExists string `json:"PrometheusOnlyIfNotExists,omitempty"` // Suppress Prometheus alerts if this file exists and is recent diff --git a/internal/notify_state.go b/internal/notify_state.go new file mode 100644 index 0000000..9c3ccd4 --- /dev/null +++ b/internal/notify_state.go @@ -0,0 +1,114 @@ +package internal + +import ( + "encoding/json" + "os" + "path/filepath" + "time" +) + +// notifyState tracks the last notification timestamp and the check states at that time. +// This enables notification batching: Gogios can suppress emails until the configured +// interval has elapsed AND there's been an actual state change since the last notification. +type notifyState struct { + stateFile string `json:"-"` + LastNotifyEpoch int64 `json:"LastNotifyEpoch"` + CheckStates map[string]int `json:"CheckStates"` // check name -> status code at last notification +} + +// newNotifyState loads the notification state from disk, or returns an empty state +// if no previous state exists (first run scenario). +func newNotifyState(stateDir string) (notifyState, error) { + ns := notifyState{ + stateFile: filepath.Join(stateDir, "notify_state.json"), + CheckStates: make(map[string]int), + } + + data, err := os.ReadFile(ns.stateFile) + if err != nil { + if os.IsNotExist(err) { + // First run - no previous notification state + return ns, nil + } + return ns, err + } + + if err := json.Unmarshal(data, &ns); err != nil { + return ns, err + } + + return ns, nil +} + +// intervalElapsed returns true if the minimum notification interval has passed +// since the last notification was sent. +func (ns notifyState) intervalElapsed(minIntervalS int) bool { + if ns.LastNotifyEpoch == 0 { + // No previous notification - interval is considered elapsed + return true + } + elapsed := time.Now().Unix() - ns.LastNotifyEpoch + return elapsed >= int64(minIntervalS) +} + +// hasChanges compares the current check states to the snapshot taken at the last +// notification. Returns true if any check has changed status, if new checks were +// added, or if checks were removed. +func (ns notifyState) hasChanges(currentState state) bool { + // Check for status changes or new checks + for name, cs := range currentState.checks { + prevStatus, exists := ns.CheckStates[name] + if !exists { + // New check appeared since last notification + return true + } + if int(cs.Status) != prevStatus { + // Status changed since last notification + return true + } + } + + // Check for removed checks + for name := range ns.CheckStates { + if _, exists := currentState.checks[name]; !exists { + return true + } + } + + return false +} + +// recordNotification saves the current timestamp and a snapshot of all check states. +// Call this after successfully sending a notification email. +func (ns *notifyState) recordNotification(currentState state) error { + ns.LastNotifyEpoch = time.Now().Unix() + ns.CheckStates = make(map[string]int) + + for name, cs := range currentState.checks { + ns.CheckStates[name] = int(cs.Status) + } + + return ns.persist() +} + +// persist writes the notification state to disk atomically using a temp file. +func (ns notifyState) persist() error { + stateDir := filepath.Dir(ns.stateFile) + if _, err := os.Stat(stateDir); os.IsNotExist(err) { + if err := os.MkdirAll(stateDir, 0o755); err != nil { + return err + } + } + + data, err := json.Marshal(ns) + if err != nil { + return err + } + + tmpFile := ns.stateFile + ".tmp" + if err := os.WriteFile(tmpFile, data, 0o644); err != nil { + return err + } + + return os.Rename(tmpFile, ns.stateFile) +} diff --git a/internal/notify_state_test.go b/internal/notify_state_test.go new file mode 100644 index 0000000..4f49471 --- /dev/null +++ b/internal/notify_state_test.go @@ -0,0 +1,225 @@ +package internal + +import ( + "os" + "path/filepath" + "testing" + "time" +) + +func TestNewNotifyState_FirstRun(t *testing.T) { + // First run with no existing state file should return empty state + tmpDir := t.TempDir() + ns, err := newNotifyState(tmpDir) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if ns.LastNotifyEpoch != 0 { + t.Errorf("expected LastNotifyEpoch=0, got %d", ns.LastNotifyEpoch) + } + if len(ns.CheckStates) != 0 { + t.Errorf("expected empty CheckStates, got %d entries", len(ns.CheckStates)) + } +} + +func TestNotifyState_Persistence(t *testing.T) { + // Test round-trip: save and load notification state + tmpDir := t.TempDir() + ns, err := newNotifyState(tmpDir) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Create a mock state and record notification + mockState := state{checks: map[string]checkState{ + "Check A": {Status: nagiosOk}, + "Check B": {Status: nagiosCritical}, + }} + + if err := ns.recordNotification(mockState); err != nil { + t.Fatalf("failed to record notification: %v", err) + } + + // Load the state back + ns2, err := newNotifyState(tmpDir) + if err != nil { + t.Fatalf("failed to reload state: %v", err) + } + + if ns2.LastNotifyEpoch == 0 { + t.Error("expected non-zero LastNotifyEpoch after reload") + } + if len(ns2.CheckStates) != 2 { + t.Errorf("expected 2 check states, got %d", len(ns2.CheckStates)) + } + if ns2.CheckStates["Check A"] != int(nagiosOk) { + t.Errorf("expected Check A status=%d, got %d", nagiosOk, ns2.CheckStates["Check A"]) + } + if ns2.CheckStates["Check B"] != int(nagiosCritical) { + t.Errorf("expected Check B status=%d, got %d", nagiosCritical, ns2.CheckStates["Check B"]) + } +} + +func TestIntervalElapsed_FirstRun(t *testing.T) { + // First run (LastNotifyEpoch=0) should always return true + ns := notifyState{LastNotifyEpoch: 0} + if !ns.intervalElapsed(3600) { + t.Error("expected intervalElapsed=true on first run") + } +} + +func TestIntervalElapsed_NotYet(t *testing.T) { + // Notification sent 30 seconds ago, interval is 60 seconds + ns := notifyState{LastNotifyEpoch: time.Now().Unix() - 30} + if ns.intervalElapsed(60) { + t.Error("expected intervalElapsed=false when only 30s of 60s elapsed") + } +} + +func TestIntervalElapsed_Elapsed(t *testing.T) { + // Notification sent 120 seconds ago, interval is 60 seconds + ns := notifyState{LastNotifyEpoch: time.Now().Unix() - 120} + if !ns.intervalElapsed(60) { + t.Error("expected intervalElapsed=true when 120s of 60s elapsed") + } +} + +func TestIntervalElapsed_ZeroInterval(t *testing.T) { + // Interval of 0 should always return true (immediate notification mode) + ns := notifyState{LastNotifyEpoch: time.Now().Unix()} + if !ns.intervalElapsed(0) { + t.Error("expected intervalElapsed=true when interval is 0") + } +} + +func TestHasChanges_NoChanges(t *testing.T) { + ns := notifyState{ + CheckStates: map[string]int{ + "Check A": int(nagiosOk), + "Check B": int(nagiosCritical), + }, + } + + currentState := state{checks: map[string]checkState{ + "Check A": {Status: nagiosOk}, + "Check B": {Status: nagiosCritical}, + }} + + if ns.hasChanges(currentState) { + t.Error("expected hasChanges=false when states are identical") + } +} + +func TestHasChanges_StatusChanged(t *testing.T) { + ns := notifyState{ + CheckStates: map[string]int{ + "Check A": int(nagiosOk), + "Check B": int(nagiosOk), + }, + } + + currentState := state{checks: map[string]checkState{ + "Check A": {Status: nagiosOk}, + "Check B": {Status: nagiosCritical}, // Changed from OK to CRITICAL + }} + + if !ns.hasChanges(currentState) { + t.Error("expected hasChanges=true when check status changed") + } +} + +func TestHasChanges_NewCheck(t *testing.T) { + ns := notifyState{ + CheckStates: map[string]int{ + "Check A": int(nagiosOk), + }, + } + + currentState := state{checks: map[string]checkState{ + "Check A": {Status: nagiosOk}, + "Check B": {Status: nagiosCritical}, // New check + }} + + if !ns.hasChanges(currentState) { + t.Error("expected hasChanges=true when new check added") + } +} + +func TestHasChanges_RemovedCheck(t *testing.T) { + ns := notifyState{ + CheckStates: map[string]int{ + "Check A": int(nagiosOk), + "Check B": int(nagiosCritical), + }, + } + + currentState := state{checks: map[string]checkState{ + "Check A": {Status: nagiosOk}, + // Check B removed + }} + + if !ns.hasChanges(currentState) { + t.Error("expected hasChanges=true when check removed") + } +} + +func TestHasChanges_EmptyPrevious(t *testing.T) { + // First notification - no previous state + ns := notifyState{ + CheckStates: map[string]int{}, + } + + currentState := state{checks: map[string]checkState{ + "Check A": {Status: nagiosOk}, + }} + + if !ns.hasChanges(currentState) { + t.Error("expected hasChanges=true on first run with checks") + } +} + +func TestHasChanges_BothEmpty(t *testing.T) { + ns := notifyState{ + CheckStates: map[string]int{}, + } + + currentState := state{checks: map[string]checkState{}} + + if ns.hasChanges(currentState) { + t.Error("expected hasChanges=false when both states are empty") + } +} + +func TestRecordNotification(t *testing.T) { + tmpDir := t.TempDir() + ns, _ := newNotifyState(tmpDir) + + mockState := state{checks: map[string]checkState{ + "Check A": {Status: nagiosOk}, + "Check B": {Status: nagiosWarning}, + "Check C": {Status: nagiosCritical}, + }} + + beforeRecord := time.Now().Unix() + if err := ns.recordNotification(mockState); err != nil { + t.Fatalf("failed to record notification: %v", err) + } + afterRecord := time.Now().Unix() + + // Verify timestamp is within expected range + if ns.LastNotifyEpoch < beforeRecord || ns.LastNotifyEpoch > afterRecord { + t.Errorf("LastNotifyEpoch=%d not in range [%d, %d]", ns.LastNotifyEpoch, beforeRecord, afterRecord) + } + + // Verify all check states were captured + if len(ns.CheckStates) != 3 { + t.Errorf("expected 3 check states, got %d", len(ns.CheckStates)) + } + + // Verify state file was created + stateFile := filepath.Join(tmpDir, "notify_state.json") + if _, err := os.Stat(stateFile); os.IsNotExist(err) { + t.Error("expected state file to be created") + } +} diff --git a/internal/run.go b/internal/run.go index 9d1b21c..63ee16f 100644 --- a/internal/run.go +++ b/internal/run.go @@ -7,10 +7,10 @@ import ( "os" ) -func Run(ctx context.Context, configFile string, renotify, force bool) { +func Run(ctx context.Context, configFile string, renotify, force bool) error { conf, err := newConfig(configFile) if err != nil { - log.Fatal(err) + return err } if err := conf.sanityCheck(); err != nil { @@ -22,6 +22,13 @@ func Run(ctx context.Context, configFile string, renotify, force bool) { notifyError(conf, err) } + // Load notification state for batching (tracks when last email was sent + // and what the check states were at that time) + notifyStateData, err := newNotifyState(conf.StateDir) + if err != nil { + log.Println("warning: failed to load notification state:", err) + } + state = runChecks(ctx, state, conf) state = mergePrometheusAlerts(ctx, state, conf) state = mergeFederated(ctx, state, conf) @@ -31,12 +38,35 @@ func Run(ctx context.Context, configFile string, renotify, force bool) { } subject, body, doNotify := state.report(renotify, force, conf.StatusPageURL, conf) + + // Apply notification batching when MinNotifyIntervalS is configured. + // Force flag bypasses batching to allow immediate notifications when needed. + if doNotify && conf.MinNotifyIntervalS > 0 && !force { + if notifyStateData.intervalElapsed(conf.MinNotifyIntervalS) { + // Interval has elapsed - only notify if state changed since last notification + if !notifyStateData.hasChanges(state) { + doNotify = false + log.Println("Notification suppressed: interval elapsed but no state changes since last notification") + } + } else { + // Interval has not elapsed - suppress notification + doNotify = false + log.Println("Notification suppressed: minimum interval not elapsed") + } + } + if doNotify { if err := notify(conf, subject, body); err != nil { log.Println("error:", err) - return + return nil + } + // Record notification timestamp and state snapshot for batching + if err := notifyStateData.recordNotification(state); err != nil { + log.Println("warning: failed to save notification state:", err) } } + + // Text and HTML reports always update regardless of notification batching if err := persistReport(subject, body, conf); err != nil { notifyError(conf, err) } @@ -47,6 +77,8 @@ func Run(ctx context.Context, configFile string, renotify, force bool) { notifyError(conf, err) } } + + return nil } func persistReport(subject, body string, conf config) error { |
