summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- internal/config.go | 6
-rw-r--r-- internal/notify_state.go | 114
-rw-r--r-- internal/notify_state_test.go | 225
-rw-r--r-- internal/run.go | 38
4 files changed, 380 insertions, 3 deletions
diff --git a/internal/config.go b/internal/config.go
index c3a0d5f..ffc2353 100644
--- a/internal/config.go
+++ b/internal/config.go
@@ -21,6 +21,12 @@ type config struct {
CheckConcurrency int
StaleThreshold int `json:"StaleThreshold,omitempty"`
Federated []string `json:"Federated,omitempty"` // TODO: Document this option
+ // MinNotifyIntervalS is the minimum interval in seconds between email notifications.
+ // When set > 0, Gogios batches notifications and only sends an email when:
+ // 1. The interval has elapsed since the last notification, AND
+ // 2. There's been a state change since the last notification.
+ // Set to 0 (default) for immediate notifications on every state change.
+ MinNotifyIntervalS int `json:"MinNotifyIntervalS,omitempty"`
PrometheusHosts []string `json:"PrometheusHosts,omitempty"`
PrometheusTimeoutS int `json:"PrometheusTimeoutS,omitempty"`
PrometheusOnlyIfNotExists string `json:"PrometheusOnlyIfNotExists,omitempty"` // Suppress Prometheus alerts if this file exists and is recent
diff --git a/internal/notify_state.go b/internal/notify_state.go
new file mode 100644
index 0000000..9c3ccd4
--- /dev/null
+++ b/internal/notify_state.go
@@ -0,0 +1,114 @@
+package internal
+
+import (
+ "encoding/json"
+ "os"
+ "path/filepath"
+ "time"
+)
+
+// notifyState tracks the last notification timestamp and the check states at that time.
+// This enables notification batching: Gogios can suppress emails until the configured
+// interval has elapsed AND there's been an actual state change since the last notification.
+//
+// Only the exported fields are written to the JSON state file; encoding/json
+// skips the unexported stateFile field.
+type notifyState struct {
+ // stateFile is the path of the JSON file this state is loaded from and persisted to.
+ stateFile string `json:"-"`
+ // LastNotifyEpoch is the Unix time (seconds) of the last recorded notification;
+ // 0 means no notification has been recorded yet.
+ LastNotifyEpoch int64 `json:"LastNotifyEpoch"`
+ CheckStates map[string]int `json:"CheckStates"` // check name -> status code at last notification
+}
+
+// newNotifyState loads the notification state from disk, or returns an empty state
+// if no previous state exists (first run scenario).
+func newNotifyState(stateDir string) (notifyState, error) {
+ ns := notifyState{
+ stateFile: filepath.Join(stateDir, "notify_state.json"),
+ CheckStates: make(map[string]int),
+ }
+
+ data, err := os.ReadFile(ns.stateFile)
+ if err != nil {
+ if os.IsNotExist(err) {
+ // First run - no previous notification state
+ return ns, nil
+ }
+ return ns, err
+ }
+
+ if err := json.Unmarshal(data, &ns); err != nil {
+ return ns, err
+ }
+
+ return ns, nil
+}
+
+// intervalElapsed reports whether at least minIntervalS seconds have passed
+// since the last recorded notification.
+//
+// It returns true when no notification has been recorded yet
+// (LastNotifyEpoch == 0) and also when the recorded timestamp lies in the
+// future, which can happen after the system clock is stepped back or when the
+// state file holds a bad value. Without the latter rule such a timestamp would
+// suppress notifications until the clock caught up with it.
+func (ns notifyState) intervalElapsed(minIntervalS int) bool {
+	if ns.LastNotifyEpoch == 0 {
+		// No previous notification - interval is considered elapsed
+		return true
+	}
+
+	elapsed := time.Now().Unix() - ns.LastNotifyEpoch
+	if elapsed < 0 {
+		// Recorded timestamp is ahead of the current clock; do not let it
+		// block notifications.
+		return true
+	}
+
+	return elapsed >= int64(minIntervalS)
+}
+
+// hasChanges reports whether currentState differs from the snapshot recorded
+// at the last notification: a check was added, a check was removed, or a
+// check's status code is different.
+func (ns notifyState) hasChanges(currentState state) bool {
+	// Differing sizes mean at least one check was added or removed.
+	if len(currentState.checks) != len(ns.CheckStates) {
+		return true
+	}
+
+	// Same size: the two sets are identical exactly when every current check
+	// is present in the snapshot with the same status.
+	for checkName, current := range currentState.checks {
+		recorded, known := ns.CheckStates[checkName]
+		if !known || recorded != int(current.Status) {
+			return true
+		}
+	}
+
+	return false
+}
+
+// recordNotification replaces the snapshot with the status of every check in
+// currentState, stamps it with the current Unix time, and persists the result
+// to the state file. It is meant to be called once a notification email has
+// been sent successfully. The returned error is the one from persist.
+func (ns *notifyState) recordNotification(currentState state) error {
+	snapshot := make(map[string]int, len(currentState.checks))
+	for checkName, check := range currentState.checks {
+		snapshot[checkName] = int(check.Status)
+	}
+
+	ns.CheckStates = snapshot
+	ns.LastNotifyEpoch = time.Now().Unix()
+
+	return ns.persist()
+}
+
+// persist writes the notification state as JSON to ns.stateFile.
+//
+// The data is first written to "<stateFile>.tmp" and then renamed over the
+// real file, so a reader never observes a half-written state file. The parent
+// directory is created if needed. If writing or renaming fails, the temp file
+// is removed on a best-effort basis and the original error is returned.
+func (ns notifyState) persist() error {
+	// MkdirAll succeeds without doing anything when the directory already
+	// exists, so no separate existence check is needed.
+	if err := os.MkdirAll(filepath.Dir(ns.stateFile), 0o755); err != nil {
+		return err
+	}
+
+	data, err := json.Marshal(ns)
+	if err != nil {
+		return err
+	}
+
+	tmpFile := ns.stateFile + ".tmp"
+	if err := os.WriteFile(tmpFile, data, 0o644); err != nil {
+		_ = os.Remove(tmpFile) // best-effort cleanup; the write error is what matters
+		return err
+	}
+
+	if err := os.Rename(tmpFile, ns.stateFile); err != nil {
+		_ = os.Remove(tmpFile) // best-effort cleanup; the rename error is what matters
+		return err
+	}
+
+	return nil
+}
diff --git a/internal/notify_state_test.go b/internal/notify_state_test.go
new file mode 100644
index 0000000..4f49471
--- /dev/null
+++ b/internal/notify_state_test.go
@@ -0,0 +1,225 @@
+package internal
+
+import (
+ "os"
+ "path/filepath"
+ "testing"
+ "time"
+)
+
+// TestNewNotifyState_FirstRun verifies that loading from a directory without a
+// state file succeeds and yields a zero timestamp and no check states.
+func TestNewNotifyState_FirstRun(t *testing.T) {
+	ns, err := newNotifyState(t.TempDir())
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	if got := ns.LastNotifyEpoch; got != 0 {
+		t.Errorf("expected LastNotifyEpoch=0, got %d", got)
+	}
+	if n := len(ns.CheckStates); n != 0 {
+		t.Errorf("expected empty CheckStates, got %d entries", n)
+	}
+}
+
+// TestNotifyState_Persistence verifies the save/load round trip: a recorded
+// notification must be readable again from the same state directory.
+func TestNotifyState_Persistence(t *testing.T) {
+	dir := t.TempDir()
+
+	original, err := newNotifyState(dir)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	// Record a notification for a two-check mock state.
+	checks := map[string]checkState{
+		"Check A": {Status: nagiosOk},
+		"Check B": {Status: nagiosCritical},
+	}
+	if err := original.recordNotification(state{checks: checks}); err != nil {
+		t.Fatalf("failed to record notification: %v", err)
+	}
+
+	// Read it back through a fresh load.
+	reloaded, err := newNotifyState(dir)
+	if err != nil {
+		t.Fatalf("failed to reload state: %v", err)
+	}
+
+	if reloaded.LastNotifyEpoch == 0 {
+		t.Error("expected non-zero LastNotifyEpoch after reload")
+	}
+	if n := len(reloaded.CheckStates); n != 2 {
+		t.Errorf("expected 2 check states, got %d", n)
+	}
+	if got := reloaded.CheckStates["Check A"]; got != int(nagiosOk) {
+		t.Errorf("expected Check A status=%d, got %d", nagiosOk, got)
+	}
+	if got := reloaded.CheckStates["Check B"]; got != int(nagiosCritical) {
+		t.Errorf("expected Check B status=%d, got %d", nagiosCritical, got)
+	}
+}
+
+func TestIntervalElapsed_FirstRun(t *testing.T) {
+ // First run (LastNotifyEpoch=0) should always return true
+ ns := notifyState{LastNotifyEpoch: 0}
+ if !ns.intervalElapsed(3600) {
+ t.Error("expected intervalElapsed=true on first run")
+ }
+}
+
+// TestIntervalElapsed_NotYet verifies that a notification sent 30 seconds ago
+// does not satisfy a 60 second interval.
+func TestIntervalElapsed_NotYet(t *testing.T) {
+	thirtySecondsAgo := time.Now().Unix() - 30
+	ns := notifyState{LastNotifyEpoch: thirtySecondsAgo}
+
+	elapsed := ns.intervalElapsed(60)
+	if elapsed {
+		t.Error("expected intervalElapsed=false when only 30s of 60s elapsed")
+	}
+}
+
+// TestIntervalElapsed_Elapsed verifies that a notification sent 120 seconds
+// ago satisfies a 60 second interval.
+func TestIntervalElapsed_Elapsed(t *testing.T) {
+	twoMinutesAgo := time.Now().Unix() - 120
+	ns := notifyState{LastNotifyEpoch: twoMinutesAgo}
+
+	elapsed := ns.intervalElapsed(60)
+	if !elapsed {
+		t.Error("expected intervalElapsed=true when 120s of 60s elapsed")
+	}
+}
+
+// TestIntervalElapsed_ZeroInterval verifies that an interval of 0 (immediate
+// notification mode) is always considered elapsed, even right after a
+// notification.
+func TestIntervalElapsed_ZeroInterval(t *testing.T) {
+	justNow := time.Now().Unix()
+	ns := notifyState{LastNotifyEpoch: justNow}
+
+	elapsed := ns.intervalElapsed(0)
+	if !elapsed {
+		t.Error("expected intervalElapsed=true when interval is 0")
+	}
+}
+
+// TestHasChanges_NoChanges verifies that identical snapshot and current state
+// are reported as unchanged.
+func TestHasChanges_NoChanges(t *testing.T) {
+	previous := map[string]int{
+		"Check A": int(nagiosOk),
+		"Check B": int(nagiosCritical),
+	}
+	current := map[string]checkState{
+		"Check A": {Status: nagiosOk},
+		"Check B": {Status: nagiosCritical},
+	}
+
+	changed := notifyState{CheckStates: previous}.hasChanges(state{checks: current})
+	if changed {
+		t.Error("expected hasChanges=false when states are identical")
+	}
+}
+
+// TestHasChanges_StatusChanged verifies that a status transition on an
+// existing check is reported as a change.
+func TestHasChanges_StatusChanged(t *testing.T) {
+	previous := map[string]int{
+		"Check A": int(nagiosOk),
+		"Check B": int(nagiosOk),
+	}
+	current := map[string]checkState{
+		"Check A": {Status: nagiosOk},
+		"Check B": {Status: nagiosCritical}, // went from OK to CRITICAL
+	}
+
+	changed := notifyState{CheckStates: previous}.hasChanges(state{checks: current})
+	if !changed {
+		t.Error("expected hasChanges=true when check status changed")
+	}
+}
+
+// TestHasChanges_NewCheck verifies that a check missing from the snapshot but
+// present in the current state is reported as a change.
+func TestHasChanges_NewCheck(t *testing.T) {
+	previous := map[string]int{
+		"Check A": int(nagiosOk),
+	}
+	current := map[string]checkState{
+		"Check A": {Status: nagiosOk},
+		"Check B": {Status: nagiosCritical}, // not in the snapshot
+	}
+
+	changed := notifyState{CheckStates: previous}.hasChanges(state{checks: current})
+	if !changed {
+		t.Error("expected hasChanges=true when new check added")
+	}
+}
+
+// TestHasChanges_RemovedCheck verifies that a check present in the snapshot
+// but absent from the current state is reported as a change.
+func TestHasChanges_RemovedCheck(t *testing.T) {
+	previous := map[string]int{
+		"Check A": int(nagiosOk),
+		"Check B": int(nagiosCritical),
+	}
+	// "Check B" is intentionally absent from the current state.
+	current := map[string]checkState{
+		"Check A": {Status: nagiosOk},
+	}
+
+	changed := notifyState{CheckStates: previous}.hasChanges(state{checks: current})
+	if !changed {
+		t.Error("expected hasChanges=true when check removed")
+	}
+}
+
+// TestHasChanges_EmptyPrevious verifies that an empty snapshot (no previous
+// notification) compared against a state with checks is reported as a change.
+func TestHasChanges_EmptyPrevious(t *testing.T) {
+	previous := map[string]int{}
+	current := map[string]checkState{
+		"Check A": {Status: nagiosOk},
+	}
+
+	changed := notifyState{CheckStates: previous}.hasChanges(state{checks: current})
+	if !changed {
+		t.Error("expected hasChanges=true on first run with checks")
+	}
+}
+
+// TestHasChanges_BothEmpty verifies that an empty snapshot compared against an
+// empty current state is reported as unchanged.
+func TestHasChanges_BothEmpty(t *testing.T) {
+	previous := map[string]int{}
+	current := map[string]checkState{}
+
+	changed := notifyState{CheckStates: previous}.hasChanges(state{checks: current})
+	if changed {
+		t.Error("expected hasChanges=false when both states are empty")
+	}
+}
+
+// TestRecordNotification verifies that recordNotification stamps the state
+// with the current time, captures every check, and creates the state file.
+func TestRecordNotification(t *testing.T) {
+	tmpDir := t.TempDir()
+	ns, err := newNotifyState(tmpDir)
+	if err != nil {
+		// A load failure would make every later assertion meaningless.
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	mockState := state{checks: map[string]checkState{
+		"Check A": {Status: nagiosOk},
+		"Check B": {Status: nagiosWarning},
+		"Check C": {Status: nagiosCritical},
+	}}
+
+	beforeRecord := time.Now().Unix()
+	if err := ns.recordNotification(mockState); err != nil {
+		t.Fatalf("failed to record notification: %v", err)
+	}
+	afterRecord := time.Now().Unix()
+
+	// Verify timestamp is within expected range
+	if ns.LastNotifyEpoch < beforeRecord || ns.LastNotifyEpoch > afterRecord {
+		t.Errorf("LastNotifyEpoch=%d not in range [%d, %d]", ns.LastNotifyEpoch, beforeRecord, afterRecord)
+	}
+
+	// Verify all check states were captured
+	if len(ns.CheckStates) != 3 {
+		t.Errorf("expected 3 check states, got %d", len(ns.CheckStates))
+	}
+
+	// Verify state file was created. Any stat error counts as a failure, not
+	// only "does not exist".
+	stateFile := filepath.Join(tmpDir, "notify_state.json")
+	if _, err := os.Stat(stateFile); err != nil {
+		t.Errorf("expected state file to be created: %v", err)
+	}
+}
diff --git a/internal/run.go b/internal/run.go
index 9d1b21c..63ee16f 100644
--- a/internal/run.go
+++ b/internal/run.go
@@ -7,10 +7,10 @@ import (
"os"
)
-func Run(ctx context.Context, configFile string, renotify, force bool) {
+func Run(ctx context.Context, configFile string, renotify, force bool) error {
conf, err := newConfig(configFile)
if err != nil {
- log.Fatal(err)
+ return err
}
if err := conf.sanityCheck(); err != nil {
@@ -22,6 +22,13 @@ func Run(ctx context.Context, configFile string, renotify, force bool) {
notifyError(conf, err)
}
+ // Load notification state for batching (tracks when last email was sent
+ // and what the check states were at that time)
+ notifyStateData, err := newNotifyState(conf.StateDir)
+ if err != nil {
+ log.Println("warning: failed to load notification state:", err)
+ }
+
state = runChecks(ctx, state, conf)
state = mergePrometheusAlerts(ctx, state, conf)
state = mergeFederated(ctx, state, conf)
@@ -31,12 +38,35 @@ func Run(ctx context.Context, configFile string, renotify, force bool) {
}
subject, body, doNotify := state.report(renotify, force, conf.StatusPageURL, conf)
+
+ // Apply notification batching when MinNotifyIntervalS is configured.
+ // Force flag bypasses batching to allow immediate notifications when needed.
+ if doNotify && conf.MinNotifyIntervalS > 0 && !force {
+ if notifyStateData.intervalElapsed(conf.MinNotifyIntervalS) {
+ // Interval has elapsed - only notify if state changed since last notification
+ if !notifyStateData.hasChanges(state) {
+ doNotify = false
+ log.Println("Notification suppressed: interval elapsed but no state changes since last notification")
+ }
+ } else {
+ // Interval has not elapsed - suppress notification
+ doNotify = false
+ log.Println("Notification suppressed: minimum interval not elapsed")
+ }
+ }
+
if doNotify {
if err := notify(conf, subject, body); err != nil {
log.Println("error:", err)
- return
+ return nil
+ }
+ // Record notification timestamp and state snapshot for batching
+ if err := notifyStateData.recordNotification(state); err != nil {
+ log.Println("warning: failed to save notification state:", err)
}
}
+
+ // Text and HTML reports always update regardless of notification batching
if err := persistReport(subject, body, conf); err != nil {
notifyError(conf, err)
}
@@ -47,6 +77,8 @@ func Run(ctx context.Context, configFile string, renotify, force bool) {
notifyError(conf, err)
}
}
+
+ return nil
}
func persistReport(subject, body string, conf config) error {