From 81db39feafdc491658f8a10dc604be851a533ffc Mon Sep 17 00:00:00 2001 From: Paul Buetow Date: Sun, 8 Feb 2026 10:57:45 +0200 Subject: feat: add peer failover alerting Introduce peer URL monitoring with active/passive alert suppression, skip checks when passive, and bump version to v1.4.0. Co-authored-by: Cursor --- internal/config.go | 49 ++++++++++++----- internal/peer.go | 145 ++++++++++++++++++++++++++++++++++++++++++++++++++ internal/peer_test.go | 83 +++++++++++++++++++++++++++++ internal/run.go | 25 +++++++-- internal/state.go | 17 +++++- internal/version.go | 2 +- 6 files changed, 303 insertions(+), 18 deletions(-) create mode 100644 internal/peer.go create mode 100644 internal/peer_test.go diff --git a/internal/config.go b/internal/config.go index ffc2353..21fbaec 100644 --- a/internal/config.go +++ b/internal/config.go @@ -5,22 +5,27 @@ import ( "fmt" "io" "log" + "net/url" "os" ) type config struct { - EmailTo string - EmailFrom string - SMTPServer string `json:"SMTPServer,omitempty"` - SMTPDisable bool `json:"SMTPDisable,omitempty"` // TODO: Document this option - StateDir string `json:"StateDir,omitempty"` - HTMLStatusFile string `json:"HTMLStatusFile,omitempty"` // Path to HTML status file - HTMLDisable bool `json:"HTMLDisable,omitempty"` // Disable HTML status page generation - StatusPageURL string `json:"StatusPageURL,omitempty"` // URL to the HTML status page for email notifications - CheckTimeoutS int - CheckConcurrency int - StaleThreshold int `json:"StaleThreshold,omitempty"` - Federated []string `json:"Federated,omitempty"` // TODO: Document this option + EmailTo string + EmailFrom string + SMTPServer string `json:"SMTPServer,omitempty"` + SMTPDisable bool `json:"SMTPDisable,omitempty"` // TODO: Document this option + StateDir string `json:"StateDir,omitempty"` + HTMLStatusFile string `json:"HTMLStatusFile,omitempty"` // Path to HTML status file + HTMLDisable bool `json:"HTMLDisable,omitempty"` // Disable HTML status page generation + StatusPageURL string `json:"StatusPageURL,omitempty"` // URL to the HTML status page for email notifications + PeerURL string `json:"PeerURL,omitempty"` // Peer Gogios JSON report URL + PeerStaleThresholdS int `json:"PeerStaleThresholdS,omitempty"` + PeerPrimaryName string `json:"PeerPrimaryName,omitempty"` + PeerSecondaryName string `json:"PeerSecondaryName,omitempty"` + CheckTimeoutS int + CheckConcurrency int + StaleThreshold int `json:"StaleThreshold,omitempty"` + Federated []string `json:"Federated,omitempty"` // TODO: Document this option // MinNotifyIntervalS is the minimum interval in seconds between email notifications. // When set > 0, Gogios batches notifications and only sends an email when: // 1. The interval has elapsed since the last notification, AND @@ -71,6 +76,26 @@ func newConfig(configFile string) (config, error) { conf.StaleThreshold = 3600 // Default to 1 hour } + if conf.PeerURL != "" { + if conf.PeerStaleThresholdS == 0 { + conf.PeerStaleThresholdS = 600 // Default to 10 minutes + } + + if conf.PeerPrimaryName == "" { + hostname, err := os.Hostname() + if err != nil { + log.Fatal(err) + } + conf.PeerPrimaryName = hostname + } + + if conf.PeerSecondaryName == "" { + if parsedURL, err := url.Parse(conf.PeerURL); err == nil && parsedURL.Hostname() != "" { + conf.PeerSecondaryName = parsedURL.Hostname() + } + } + } + if conf.PrometheusTimeoutS == 0 { conf.PrometheusTimeoutS = 2 // Default to 2 seconds } diff --git a/internal/peer.go b/internal/peer.go new file mode 100644 index 0000000..bc0d12e --- /dev/null +++ b/internal/peer.go @@ -0,0 +1,145 @@ +package internal + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/url" + "os" + "time" +) + +type peerReport struct { + LastUpdated string `json:"lastUpdated"` +} + +func peerActive(ctx context.Context, conf config) (bool, string) { + if conf.PeerURL == "" { + return true, "Peer failover: disabled (PeerURL not set)" + } + + hostname, err := os.Hostname() + if err != nil { + return true, fmt.Sprintf("Peer failover: hostname lookup failed (%v); staying active", err) + } + + return peerActiveAt(ctx, conf, time.Now(), hostname, fetchPeerLastUpdated) +} + +func peerActiveAt( + ctx context.Context, + conf config, + now time.Time, + hostname string, + fetch func(context.Context, string) (time.Time, error), +) (bool, string) { + if conf.PeerURL == "" { + return true, "Peer failover: disabled (PeerURL not set)" + } + + primary := conf.PeerPrimaryName + if primary == "" { + primary = hostname + } + + secondary := conf.PeerSecondaryName + if secondary == "" { + if parsedURL, err := url.Parse(conf.PeerURL); err == nil && parsedURL.Hostname() != "" { + secondary = parsedURL.Hostname() + } + } + + if primary == "" || secondary == "" { + return true, "Peer failover: missing peer names; staying active" + } + + if hostname != primary && hostname != secondary { + return true, fmt.Sprintf("Peer failover: local hostname %s not in [%s, %s]; staying active", + hostname, primary, secondary) + } + + staleThresholdS := conf.PeerStaleThresholdS + if staleThresholdS == 0 { + staleThresholdS = 600 + } + + lastUpdated, err := fetch(ctx, conf.PeerURL) + if err != nil { + return true, fmt.Sprintf("Peer failover: peer check failed (%v); staying active", err) + } + + age := now.Sub(lastUpdated) + if age > time.Duration(staleThresholdS)*time.Second { + return true, fmt.Sprintf("Peer failover: peer stale (%v > %ds); staying active", + age, staleThresholdS) + } + + master := scheduledMaster(primary, secondary, now) + if hostname == master { + return true, fmt.Sprintf("Peer failover: peer healthy; scheduled master is %s", master) + } + + return false, fmt.Sprintf("Peer failover: peer healthy; scheduled master is %s", master) +} + +func scheduledMaster(primary, secondary string, now time.Time) string { + week := weekNumberSunday(now) + if week%2 == 0 { + return secondary + } + return primary +} + +// weekNumberSunday matches strftime %U (Sunday-based week number, 00-53). +func weekNumberSunday(t time.Time) int { + tUTC := t.In(time.UTC) + yearStart := time.Date(tUTC.Year(), 1, 1, 0, 0, 0, 0, time.UTC) + + // Find the first Sunday on or after Jan 1. + daysUntilSunday := (7 - int(yearStart.Weekday())) % 7 + firstSunday := yearStart.AddDate(0, 0, daysUntilSunday) + + if tUTC.Before(firstSunday) { + return 0 + } + + daysSinceFirstSunday := int(tUTC.Sub(firstSunday).Hours() / 24) + return 1 + (daysSinceFirstSunday / 7) +} + +func fetchPeerLastUpdated(ctx context.Context, peerURL string) (time.Time, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, peerURL, nil) + if err != nil { + return time.Time{}, err + } + + client := http.Client{ + Timeout: 5 * time.Second, + } + + resp, err := client.Do(req) + if err != nil { + return time.Time{}, err + } + defer resp.Body.Close() + + if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices { + return time.Time{}, fmt.Errorf("unexpected status %d", resp.StatusCode) + } + + var report peerReport + if err := json.NewDecoder(resp.Body).Decode(&report); err != nil { + return time.Time{}, err + } + if report.LastUpdated == "" { + return time.Time{}, fmt.Errorf("missing lastUpdated") + } + + lastUpdated, err := time.Parse(time.RFC3339, report.LastUpdated) + if err != nil { + return time.Time{}, err + } + + return lastUpdated, nil +} diff --git a/internal/peer_test.go b/internal/peer_test.go new file mode 100644 index 0000000..2cada60 --- /dev/null +++ b/internal/peer_test.go @@ -0,0 +1,83 @@ +package internal + +import ( + "context" + "errors" + "testing" + "time" +) + +func TestPeerActiveAtStale(t *testing.T) { + now := time.Date(2023, 1, 1, 12, 0, 0, 0, time.UTC) + conf := config{ + PeerURL: "https://peer.example/gogios/index.json", + PeerStaleThresholdS: 600, + PeerPrimaryName: "primary", + PeerSecondaryName: "secondary", + } + + lastUpdated := now.Add(-11 * time.Minute) + active, _ := peerActiveAt(context.Background(), conf, now, "secondary", + func(context.Context, string) (time.Time, error) { + return lastUpdated, nil + }) + + if !active { + t.Fatalf("expected active when peer is stale") + } +} + +func TestPeerActiveAtFreshStandby(t *testing.T) { + now := time.Date(2023, 1, 1, 12, 0, 0, 0, time.UTC) + conf := config{ + PeerURL: "https://peer.example/gogios/index.json", + PeerStaleThresholdS: 600, + PeerPrimaryName: "primary", + PeerSecondaryName: "secondary", + } + + lastUpdated := now.Add(-1 * time.Minute) + active, _ := peerActiveAt(context.Background(), conf, now, "secondary", + func(context.Context, string) (time.Time, error) { + return lastUpdated, nil + }) + + if active { + t.Fatalf("expected passive when peer is healthy and local is standby") + } +} + +func TestPeerActiveAtFetchError(t *testing.T) { + now := time.Date(2023, 1, 1, 12, 0, 0, 0, time.UTC) + conf := config{ + PeerURL: "https://peer.example/gogios/index.json", + PeerStaleThresholdS: 600, + PeerPrimaryName: "primary", + PeerSecondaryName: "secondary", + } + + active, _ := peerActiveAt(context.Background(), conf, now, "secondary", + func(context.Context, string) (time.Time, error) { + return time.Time{}, errors.New("boom") + }) + + if !active { + t.Fatalf("expected active on peer fetch error") + } +} + +func TestScheduledMasterWeekParity(t *testing.T) { + primary := "primary" + secondary := "secondary" + + weekOne := time.Date(2023, 1, 1, 0, 0, 0, 0, time.UTC) // Sunday + weekTwo := time.Date(2023, 1, 8, 0, 0, 0, 0, time.UTC) // Next Sunday + + if master := scheduledMaster(primary, secondary, weekOne); master != primary { + t.Fatalf("expected primary to be master in week 1, got %s", master) + } + + if master := scheduledMaster(primary, secondary, weekTwo); master != secondary { + t.Fatalf("expected secondary to be master in week 2, got %s", master) + } +} diff --git a/internal/run.go b/internal/run.go index 21b9b6d..2548098 100644 --- a/internal/run.go +++ b/internal/run.go @@ -29,15 +29,27 @@ func Run(ctx context.Context, configFile string, renotify, force bool) error { log.Println("warning: failed to load notification state:", err) } - state = runChecks(ctx, state, conf) - state = mergePrometheusAlerts(ctx, state, conf) - state = mergeFederated(ctx, state, conf) + peerIsActive := true + peerReason := "" + if conf.PeerURL != "" { + peerIsActive, peerReason = peerActive(ctx, conf) + log.Println(peerReason) + } + + passive := !peerIsActive && !force + if passive { + log.Println("Skipping checks: peer is active") + } else { + state = runChecks(ctx, state, conf) + state = mergePrometheusAlerts(ctx, state, conf) + state = mergeFederated(ctx, state, conf) + } if err := state.persist(); err != nil { notifyError(conf, err) } - subject, body, doNotify := state.report(renotify, force, conf.StatusPageURL, conf) + subject, body, doNotify := state.report(renotify, force, conf.StatusPageURL, conf, passive, peerReason) // Apply notification batching when MinNotifyIntervalS is configured. // Force flag bypasses batching to allow immediate notifications when needed. @@ -55,6 +67,11 @@ func Run(ctx context.Context, configFile string, renotify, force bool) error { } } + if passive { + doNotify = false + log.Println("Notification suppressed: peer is active") + } + if doNotify { if err := notify(conf, subject, body); err != nil { log.Println("error:", err) diff --git a/internal/state.go b/internal/state.go index b9631b4..e33d5cb 100644 --- a/internal/state.go +++ b/internal/state.go @@ -133,10 +133,25 @@ func (s state) persist() error { // report generates the notification email content. // statusPageURL is included as a link to the HTML status page. // conf is used to determine which checks should be suppressed from the report. -func (s state) report(renotify, force bool, statusPageURL string, conf config) (string, string, bool) { +func (s state) report( + renotify, force bool, + statusPageURL string, + conf config, + passive bool, + passiveReason string, +) (string, string, bool) { var sb strings.Builder sb.WriteString("This is the recent Gogios report!\n\n") + if passive { + sb.WriteString("NOTE: Passive mode active, checks were skipped.\n") + if passiveReason != "" { + sb.WriteString("Reason: ") + sb.WriteString(passiveReason) + sb.WriteString("\n") + } + sb.WriteString("\n") + } sb.WriteString("# Alerts with status changed:\n\n") changed := s.reportChanged(&sb, conf) diff --git a/internal/version.go b/internal/version.go index 283d9cb..9f7a641 100644 --- a/internal/version.go +++ b/internal/version.go @@ -1,3 +1,3 @@ package internal -const Version = "v1.3.0" +const Version = "v1.4.0" -- cgit v1.2.3