summaryrefslogtreecommitdiff
path: root/internal
diff options
context:
space:
mode:
Diffstat (limited to 'internal')
-rw-r--r--internal/config.go49
-rw-r--r--internal/peer.go145
-rw-r--r--internal/peer_test.go83
-rw-r--r--internal/run.go25
-rw-r--r--internal/state.go17
-rw-r--r--internal/version.go2
6 files changed, 303 insertions, 18 deletions
diff --git a/internal/config.go b/internal/config.go
index ffc2353..21fbaec 100644
--- a/internal/config.go
+++ b/internal/config.go
@@ -5,22 +5,27 @@ import (
"fmt"
"io"
"log"
+ "net/url"
"os"
)
type config struct {
- EmailTo string
- EmailFrom string
- SMTPServer string `json:"SMTPServer,omitempty"`
- SMTPDisable bool `json:"SMTPDisable,omitempty"` // TODO: Document this option
- StateDir string `json:"StateDir,omitempty"`
- HTMLStatusFile string `json:"HTMLStatusFile,omitempty"` // Path to HTML status file
- HTMLDisable bool `json:"HTMLDisable,omitempty"` // Disable HTML status page generation
- StatusPageURL string `json:"StatusPageURL,omitempty"` // URL to the HTML status page for email notifications
- CheckTimeoutS int
- CheckConcurrency int
- StaleThreshold int `json:"StaleThreshold,omitempty"`
- Federated []string `json:"Federated,omitempty"` // TODO: Document this option
+ EmailTo string
+ EmailFrom string
+ SMTPServer string `json:"SMTPServer,omitempty"`
+ SMTPDisable bool `json:"SMTPDisable,omitempty"` // TODO: Document this option
+ StateDir string `json:"StateDir,omitempty"`
+ HTMLStatusFile string `json:"HTMLStatusFile,omitempty"` // Path to HTML status file
+ HTMLDisable bool `json:"HTMLDisable,omitempty"` // Disable HTML status page generation
+ StatusPageURL string `json:"StatusPageURL,omitempty"` // URL to the HTML status page for email notifications
+ PeerURL string `json:"PeerURL,omitempty"` // Peer Gogios JSON report URL
+ PeerStaleThresholdS int `json:"PeerStaleThresholdS,omitempty"`
+ PeerPrimaryName string `json:"PeerPrimaryName,omitempty"`
+ PeerSecondaryName string `json:"PeerSecondaryName,omitempty"`
+ CheckTimeoutS int
+ CheckConcurrency int
+ StaleThreshold int `json:"StaleThreshold,omitempty"`
+ Federated []string `json:"Federated,omitempty"` // TODO: Document this option
// MinNotifyIntervalS is the minimum interval in seconds between email notifications.
// When set > 0, Gogios batches notifications and only sends an email when:
// 1. The interval has elapsed since the last notification, AND
@@ -71,6 +76,26 @@ func newConfig(configFile string) (config, error) {
conf.StaleThreshold = 3600 // Default to 1 hour
}
+ if conf.PeerURL != "" {
+ if conf.PeerStaleThresholdS == 0 {
+ conf.PeerStaleThresholdS = 600 // Default to 10 minutes
+ }
+
+ if conf.PeerPrimaryName == "" {
+ hostname, err := os.Hostname()
+ if err != nil {
+ log.Fatal(err)
+ }
+ conf.PeerPrimaryName = hostname
+ }
+
+ if conf.PeerSecondaryName == "" {
+ if parsedURL, err := url.Parse(conf.PeerURL); err == nil && parsedURL.Hostname() != "" {
+ conf.PeerSecondaryName = parsedURL.Hostname()
+ }
+ }
+ }
+
if conf.PrometheusTimeoutS == 0 {
conf.PrometheusTimeoutS = 2 // Default to 2 seconds
}
diff --git a/internal/peer.go b/internal/peer.go
new file mode 100644
index 0000000..bc0d12e
--- /dev/null
+++ b/internal/peer.go
@@ -0,0 +1,145 @@
+package internal
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "net/http"
+ "net/url"
+ "os"
+ "time"
+)
+
+// peerReport is the subset of the peer's JSON report that failover needs:
+// only the top-level "lastUpdated" member is decoded.
+type peerReport struct {
+	// LastUpdated is kept as the raw string from the report; it is parsed as
+	// RFC 3339 by fetchPeerLastUpdated.
+	LastUpdated string `json:"lastUpdated"`
+}
+
+// peerActive reports whether this node should run its checks right now and
+// returns a human-readable reason for the decision.
+//
+// It supplies the real inputs (current time, local hostname, HTTP fetch of the
+// peer report) to peerActiveAt. The node stays active when failover is
+// disabled (empty PeerURL) or when the local hostname cannot be determined.
+func peerActive(ctx context.Context, conf config) (bool, string) {
+	if len(conf.PeerURL) == 0 {
+		return true, "Peer failover: disabled (PeerURL not set)"
+	}
+
+	localName, lookupErr := os.Hostname()
+	if lookupErr != nil {
+		reason := fmt.Sprintf("Peer failover: hostname lookup failed (%v); staying active", lookupErr)
+		return true, reason
+	}
+
+	return peerActiveAt(ctx, conf, time.Now(), localName, fetchPeerLastUpdated)
+}
+
+// peerActiveAt decides whether this node should run checks ("active", true) or
+// stand by ("passive", false), and returns a log-friendly explanation.
+//
+// now, hostname and fetch are parameters so the decision can be exercised
+// without a real clock, hostname or network. fetch receives conf.PeerURL and
+// returns the time the peer last updated its report.
+//
+// The node stays active (fail-open) whenever the decision cannot be made
+// reliably: failover disabled, a peer name cannot be determined, the local
+// hostname is neither peer name, the fetch fails, or the peer report is stale
+// or dated implausibly far in the future. It returns false only when the peer
+// looks healthy and scheduledMaster names the other node.
+//
+// NOTE(review): both nodes must agree on which name is primary. The fallbacks
+// used below (local hostname, host part of PeerURL) are relative to the node
+// evaluating them, so two nodes relying on them would each treat themselves as
+// primary — confirm that deployments set PeerPrimaryName and
+// PeerSecondaryName explicitly and identically.
+func peerActiveAt(
+	ctx context.Context,
+	conf config,
+	now time.Time,
+	hostname string,
+	fetch func(context.Context, string) (time.Time, error),
+) (bool, string) {
+	if conf.PeerURL == "" {
+		return true, "Peer failover: disabled (PeerURL not set)"
+	}
+
+	primary := conf.PeerPrimaryName
+	if primary == "" {
+		primary = hostname
+	}
+
+	secondary := conf.PeerSecondaryName
+	if secondary == "" {
+		if parsedURL, err := url.Parse(conf.PeerURL); err == nil && parsedURL.Hostname() != "" {
+			secondary = parsedURL.Hostname()
+		}
+	}
+
+	if primary == "" || secondary == "" {
+		return true, "Peer failover: missing peer names; staying active"
+	}
+
+	if hostname != primary && hostname != secondary {
+		return true, fmt.Sprintf("Peer failover: local hostname %s not in [%s, %s]; staying active",
+			hostname, primary, secondary)
+	}
+
+	staleThresholdS := conf.PeerStaleThresholdS
+	if staleThresholdS == 0 {
+		staleThresholdS = 600
+	}
+	staleThreshold := time.Duration(staleThresholdS) * time.Second
+
+	lastUpdated, err := fetch(ctx, conf.PeerURL)
+	if err != nil {
+		return true, fmt.Sprintf("Peer failover: peer check failed (%v); staying active", err)
+	}
+
+	age := now.Sub(lastUpdated)
+	if age > staleThreshold {
+		return true, fmt.Sprintf("Peer failover: peer stale (%v > %ds); staying active",
+			age, staleThresholdS)
+	}
+
+	// A report dated further in the future than the stale threshold is not
+	// proof of life: it would keep looking fresh long after the peer stopped
+	// updating it. Small negative ages (ordinary clock skew) are still accepted.
+	if staleThreshold > 0 && age < -staleThreshold {
+		return true, fmt.Sprintf("Peer failover: peer timestamp is %v in the future; staying active",
+			-age)
+	}
+
+	// Both outcomes share the same explanation; only the boolean differs.
+	master := scheduledMaster(primary, secondary, now)
+	reason := fmt.Sprintf("Peer failover: peer healthy; scheduled master is %s", master)
+	return hostname == master, reason
+}
+
+// scheduledMaster returns the node scheduled to be master at now: secondary
+// when the week number from weekNumberSunday is even (week 0 included),
+// primary when it is odd.
+func scheduledMaster(primary, secondary string, now time.Time) string {
+	if evenWeek := weekNumberSunday(now)%2 == 0; evenWeek {
+		return secondary
+	}
+	return primary
+}
+
+// weekNumberSunday matches strftime %U (Sunday-based week number, 00-53),
+// evaluated in UTC: days before the year's first Sunday are week 0, and the
+// week that starts on that Sunday is week 1.
+//
+// The result is computed with integer day arithmetic. Deriving the day count
+// from Duration.Hours() (a float64) can round up in the last few nanoseconds
+// before midnight late in the year and report the following day's week.
+func weekNumberSunday(t time.Time) int {
+	tUTC := t.UTC()
+
+	// YearDay is 1-based and Weekday is 0 for Sunday, so YearDay-1-Weekday is
+	// the zero-based day of the year of the Sunday that opens the current week
+	// (negative before the first Sunday). Adding 7 before dividing makes that
+	// first Sunday start week 1 and keeps the numerator non-negative.
+	return (tUTC.YearDay() + 6 - int(tUTC.Weekday())) / 7
+}
+
+// fetchPeerLastUpdated GETs the peer's JSON report at peerURL and returns the
+// time found in its "lastUpdated" member, which must be an RFC 3339 timestamp.
+//
+// The request is bound to ctx and additionally limited to 5 seconds. An error
+// is returned when the request cannot be built or completed, the response
+// status is not 2xx, the body is not valid JSON, or lastUpdated is missing or
+// not parseable. Each error is prefixed with the step that failed, because
+// callers only log the error text.
+func fetchPeerLastUpdated(ctx context.Context, peerURL string) (time.Time, error) {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, peerURL, nil)
+	if err != nil {
+		return time.Time{}, fmt.Errorf("building peer request: %w", err)
+	}
+
+	client := http.Client{
+		Timeout: 5 * time.Second,
+	}
+
+	resp, err := client.Do(req)
+	if err != nil {
+		return time.Time{}, fmt.Errorf("fetching peer report: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
+		return time.Time{}, fmt.Errorf("unexpected status %d", resp.StatusCode)
+	}
+
+	var report peerReport
+	if err := json.NewDecoder(resp.Body).Decode(&report); err != nil {
+		return time.Time{}, fmt.Errorf("decoding peer report: %w", err)
+	}
+	if report.LastUpdated == "" {
+		return time.Time{}, fmt.Errorf("missing lastUpdated")
+	}
+
+	lastUpdated, err := time.Parse(time.RFC3339, report.LastUpdated)
+	if err != nil {
+		return time.Time{}, fmt.Errorf("parsing lastUpdated: %w", err)
+	}
+
+	return lastUpdated, nil
+}
diff --git a/internal/peer_test.go b/internal/peer_test.go
new file mode 100644
index 0000000..2cada60
--- /dev/null
+++ b/internal/peer_test.go
@@ -0,0 +1,83 @@
+package internal
+
+import (
+ "context"
+ "errors"
+ "testing"
+ "time"
+)
+
+// TestPeerActiveAtStale checks that the local node stays active when the peer
+// report is older than PeerStaleThresholdS (11 minutes against 600 seconds).
+func TestPeerActiveAtStale(t *testing.T) {
+	var (
+		at  = time.Date(2023, 1, 1, 12, 0, 0, 0, time.UTC)
+		cfg = config{
+			PeerURL:             "https://peer.example/gogios/index.json",
+			PeerStaleThresholdS: 600,
+			PeerPrimaryName:     "primary",
+			PeerSecondaryName:   "secondary",
+		}
+	)
+
+	stalePeer := func(context.Context, string) (time.Time, error) {
+		return at.Add(-11 * time.Minute), nil
+	}
+
+	if active, _ := peerActiveAt(context.Background(), cfg, at, "secondary", stalePeer); !active {
+		t.Fatalf("expected active when peer is stale")
+	}
+}
+
+// TestPeerActiveAtFreshStandby checks that the local node goes passive when
+// the peer report is fresh and the local node is not the scheduled master.
+// 2023-01-01 is a Sunday in week 1, so "primary" is master and the local
+// "secondary" must stand by.
+func TestPeerActiveAtFreshStandby(t *testing.T) {
+	var (
+		at  = time.Date(2023, 1, 1, 12, 0, 0, 0, time.UTC)
+		cfg = config{
+			PeerURL:             "https://peer.example/gogios/index.json",
+			PeerStaleThresholdS: 600,
+			PeerPrimaryName:     "primary",
+			PeerSecondaryName:   "secondary",
+		}
+	)
+
+	freshPeer := func(context.Context, string) (time.Time, error) {
+		return at.Add(-1 * time.Minute), nil
+	}
+
+	if active, _ := peerActiveAt(context.Background(), cfg, at, "secondary", freshPeer); active {
+		t.Fatalf("expected passive when peer is healthy and local is standby")
+	}
+}
+
+// TestPeerActiveAtFetchError checks that the local node stays active when the
+// peer report cannot be fetched.
+func TestPeerActiveAtFetchError(t *testing.T) {
+	var (
+		at  = time.Date(2023, 1, 1, 12, 0, 0, 0, time.UTC)
+		cfg = config{
+			PeerURL:             "https://peer.example/gogios/index.json",
+			PeerStaleThresholdS: 600,
+			PeerPrimaryName:     "primary",
+			PeerSecondaryName:   "secondary",
+		}
+	)
+
+	failingPeer := func(context.Context, string) (time.Time, error) {
+		return time.Time{}, errors.New("boom")
+	}
+
+	if active, _ := peerActiveAt(context.Background(), cfg, at, "secondary", failingPeer); !active {
+		t.Fatalf("expected active on peer fetch error")
+	}
+}
+
+// TestScheduledMasterWeekParity checks the alternation rule of
+// scheduledMaster: primary in week 1, secondary in week 2.
+func TestScheduledMasterWeekParity(t *testing.T) {
+	const (
+		primary   = "primary"
+		secondary = "secondary"
+	)
+
+	// 2023-01-01 is a Sunday, so it opens week 1; the next Sunday opens week 2.
+	weekOne := time.Date(2023, 1, 1, 0, 0, 0, 0, time.UTC)
+	weekTwo := time.Date(2023, 1, 8, 0, 0, 0, 0, time.UTC)
+
+	if got := scheduledMaster(primary, secondary, weekOne); got != primary {
+		t.Fatalf("expected primary to be master in week 1, got %s", got)
+	}
+
+	if got := scheduledMaster(primary, secondary, weekTwo); got != secondary {
+		t.Fatalf("expected secondary to be master in week 2, got %s", got)
+	}
+}
diff --git a/internal/run.go b/internal/run.go
index 21b9b6d..2548098 100644
--- a/internal/run.go
+++ b/internal/run.go
@@ -29,15 +29,27 @@ func Run(ctx context.Context, configFile string, renotify, force bool) error {
log.Println("warning: failed to load notification state:", err)
}
- state = runChecks(ctx, state, conf)
- state = mergePrometheusAlerts(ctx, state, conf)
- state = mergeFederated(ctx, state, conf)
+ peerIsActive := true
+ peerReason := ""
+ if conf.PeerURL != "" {
+ peerIsActive, peerReason = peerActive(ctx, conf)
+ log.Println(peerReason)
+ }
+
+ passive := !peerIsActive && !force
+ if passive {
+ log.Println("Skipping checks: peer is active")
+ } else {
+ state = runChecks(ctx, state, conf)
+ state = mergePrometheusAlerts(ctx, state, conf)
+ state = mergeFederated(ctx, state, conf)
+ }
if err := state.persist(); err != nil {
notifyError(conf, err)
}
- subject, body, doNotify := state.report(renotify, force, conf.StatusPageURL, conf)
+ subject, body, doNotify := state.report(renotify, force, conf.StatusPageURL, conf, passive, peerReason)
// Apply notification batching when MinNotifyIntervalS is configured.
// Force flag bypasses batching to allow immediate notifications when needed.
@@ -55,6 +67,11 @@ func Run(ctx context.Context, configFile string, renotify, force bool) error {
}
}
+ if passive {
+ doNotify = false
+ log.Println("Notification suppressed: peer is active")
+ }
+
if doNotify {
if err := notify(conf, subject, body); err != nil {
log.Println("error:", err)
diff --git a/internal/state.go b/internal/state.go
index b9631b4..e33d5cb 100644
--- a/internal/state.go
+++ b/internal/state.go
@@ -133,10 +133,25 @@ func (s state) persist() error {
// report generates the notification email content.
// statusPageURL is included as a link to the HTML status page.
// conf is used to determine which checks should be suppressed from the report.
-func (s state) report(renotify, force bool, statusPageURL string, conf config) (string, string, bool) {
+func (s state) report(
+ renotify, force bool,
+ statusPageURL string,
+ conf config,
+ passive bool,
+ passiveReason string,
+) (string, string, bool) {
var sb strings.Builder
sb.WriteString("This is the recent Gogios report!\n\n")
+ if passive {
+ sb.WriteString("NOTE: Passive mode active, checks were skipped.\n")
+ if passiveReason != "" {
+ sb.WriteString("Reason: ")
+ sb.WriteString(passiveReason)
+ sb.WriteString("\n")
+ }
+ sb.WriteString("\n")
+ }
sb.WriteString("# Alerts with status changed:\n\n")
changed := s.reportChanged(&sb, conf)
diff --git a/internal/version.go b/internal/version.go
index 283d9cb..9f7a641 100644
--- a/internal/version.go
+++ b/internal/version.go
@@ -1,3 +1,3 @@
package internal
-const Version = "v1.3.0"
+// Version is the current Gogios version string.
+const Version = "v1.4.0"