diff options
| author | Paul Buetow <paul@buetow.org> | 2025-10-27 23:36:49 +0200 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2025-10-27 23:36:49 +0200 |
| commit | 81d1550df55318beff8e9f762952a33daaa7c0cf (patch) | |
| tree | 897e3c044c8e3bf5c9d71d98345fde9a645e8c7a | |
| parent | 6352e8c33c1c22af382093d406d477d1530950db (diff) | |
feat: Add RandomSpread and RunInterval to checks
This commit introduces two new optional parameters to the check configuration:
- `RandomSpread`: This parameter allows specifying a random sleep time up to N seconds before a check is executed. This is useful to avoid all checks running at the same time.
- `RunInterval`: This parameter defines the minimum interval in seconds between two executions of a check. This is useful if gogios is run more frequently than a specific check should be.
The `README.md` has been updated to document these new features.
fix: Fix deadlock when skipping checks
This commit also fixes a deadlock that occurred when a check was skipped due to the `RunInterval` setting. The `inputWg.Done()` was not being called, causing the main goroutine to wait forever.
build: Replace Taskfile with Magefile
The `Taskfile.yml` has been replaced with a `Magefile.go` to manage the build process. This provides more flexibility and is more idiomatic for Go projects.
| -rw-r--r-- | .gitignore | 3 | ||||
| -rw-r--r-- | Magefile.go | 105 | ||||
| -rw-r--r-- | README.md | 14 | ||||
| -rw-r--r-- | internal/check.go | 4 | ||||
| -rw-r--r-- | internal/runchecks.go | 34 | ||||
| -rw-r--r-- | internal/state.go | 20 | ||||
| -rw-r--r-- | internal/state_test.go | 24 |
7 files changed, 188 insertions, 16 deletions
@@ -1 +1,2 @@ -dist/ +gogios +dist/
\ No newline at end of file diff --git a/Magefile.go b/Magefile.go new file mode 100644 index 0000000..dc7d90b --- /dev/null +++ b/Magefile.go @@ -0,0 +1,105 @@ +//go:build mage +// +build mage + +package main + +import ( + "fmt" + "os" + "os/exec" + + "github.com/magefile/mage/mg" +) + +// Build builds the gogios binary. +func Build() error { + fmt.Println("Building...") + cmd := exec.Command("go", "build", "-o", "gogios", "cmd/gogios/main.go") + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + return cmd.Run() +} + +// Dev builds the gogios binary with race detection. +func Dev() error { + mg.Deps(Vet, Lint) + fmt.Println("Building with race detector...") + cmd := exec.Command("go", "build", "-race", "-o", "gogios", "cmd/gogios/main.go") + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + return cmd.Run() +} + +// Vet runs go vet on all go files. +func Vet() error { + fmt.Println("Vetting...") + cmd := exec.Command("go", "vet", "./...") + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + return cmd.Run() +} + +// Lint runs golangci-lint. +func Lint() error { + fmt.Println("Linting...") + cmd := exec.Command("golangci-lint", "run") + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + return cmd.Run() +} + +// LintInstall installs golangci-lint. +func LintInstall() error { + fmt.Println("Installing golangci-lint...") + cmd := exec.Command("go", "install", "github.com/golangci/golangci-lint/cmd/golangci-lint@latest") + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + return cmd.Run() +} + +// Test runs all unit tests. 
+func Test() error { + fmt.Println("Cleaning test cache...") + cleanCmd := exec.Command("go", "clean", "-testcache") + cleanCmd.Stdout = os.Stdout + cleanCmd.Stderr = os.Stderr + if err := cleanCmd.Run(); err != nil { + return err + } + + fmt.Println("Running tests...") + testCmd := exec.Command("go", "test", "./...") + testCmd.Stdout = os.Stdout + testCmd.Stderr = os.Stderr + return testCmd.Run() +} + +// Openbsd builds and deploys the gogios binary for OpenBSD. +func Openbsd() error { + mg.Deps(BuildOpenbsd, DeployOpenbsd) + return nil +} + +// BuildOpenbsd builds the gogios binary for OpenBSD. +func BuildOpenbsd() error { + fmt.Println("Building for OpenBSD...") + if err := os.Setenv("GOOS", "openbsd"); err != nil { + return err + } + if err := os.Setenv("GOARCH", "amd64"); err != nil { + return err + } + cmd := exec.Command("go", "build", "-o", "gogios", "cmd/gogios/main.go") + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + return cmd.Run() +} + +// DeployOpenbsd copies the gogios binary for OpenBSD. 
+func DeployOpenbsd() error { + fmt.Println("Copying binary...") + cpCmd := exec.Command("cp", "gogios", "/home/paul/git/conf/frontends/usr/local/bin/gogios") + cpCmd.Stdout = os.Stdout + cpCmd.Stderr = os.Stderr + return cpCmd.Run() +} @@ -117,13 +117,15 @@ To configure Gogios, create a JSON configuration file (e.g., `/etc/gogios.json`) "Plugin": "/usr/local/libexec/nagios/check_ping", "Args": [ "-H", "www.foo.zone", "-4", "-w", "50,10%", "-c", "100,15%" ], "Retries": 3, - "RetryInterval": 10 + "RetryInterval": 10, + "RandomSpread": 60 }, "Check ICMP6 www.foo.zone": { "Plugin": "/usr/local/libexec/nagios/check_ping", - "Args": [ "-H", "www.foo.zone", "-6", "-w", "50,10%", "-c", "100,15%" ] + "Args": [ "-H", "www.foo.zone", "-6", "-w", "50,10%", "-c", "100,15%" ], "Retries": 3, - "RetryInterval": 10 + "RetryInterval": 10, + "RunInterval": 300 }, "www.foo.zone HTTP IPv4": { "Plugin": "/usr/local/libexec/nagios/check_http", @@ -157,6 +159,10 @@ If you want to execute checks only when another check succeeded (status OK), use `Retries` and `RetryInterval` are optional check configuration parameters. In case of failure, Gogios will retry `Retries` times each `RetryInterval` seconds. +`RandomSpread` is an optional check configuration parameter. It will cause a random sleep of up to N seconds (specified by config by each check) before the check is being executed. This is useful to avoid all checks running at the same time. + +`RunInterval` is an optional check configuration parameter. It defines the minimum interval in seconds between two executions of a check. This is useful if you run gogios more frequently than you want to run a specific check. + For remote checks, use the `check_nrpe` plugin. You also need to have the NRPE server set up correctly on the target host (out of scope for this document). 
The `state.json` file mentioned above keeps track of the monitoring state and check results between Gogios runs, enabling Gogios only to send email notifications when there are changes in the check status. @@ -202,4 +208,4 @@ My primary goal was to have a single email address for notifications and a simpl This led me to create Gogios, a lightweight monitoring tool tailored to my specific needs. I chose the Go programming language for this project as it allowed me to refresh my Go programming skills and provided a robust platform for developing a fast and efficient monitoring tool. -Gogios eliminates unnecessary features and focuses on simplicity, providing a no-frills monitoring solution for small-scale self-hosted servers and virtual machines. The result is a tool that is easy to configure, set up, and maintain, ensuring that monitoring your resources is as hassle-free as possible. +Gogios eliminates unnecessary features and focuses on simplicity, providing a no-frills monitoring solution for small-scale self-hosted servers and virtual machines. The result is a tool that is easy to configure, set up, and maintain, ensuring that monitoring your resources is as hassle-free as possible.
\ No newline at end of file diff --git a/internal/check.go b/internal/check.go index 70f0044..3f2e4cc 100644 --- a/internal/check.go +++ b/internal/check.go @@ -14,6 +14,8 @@ type check struct { DependsOn []string `json:"DependsOn,omitempty"` Retries int `json:"Retries,omitempty"` RetryInterval int `json:"RetryInterval,omitempty"` + RunInterval int `json:"RunInterval,omitempty"` + RandomSpread int `json:"RandomSpread,omitempty"` } type namedCheck struct { @@ -65,4 +67,4 @@ func (c namedCheck) run(ctx context.Context) checkResult { func (c namedCheck) skip(output string) checkResult { return c.check.skip(c.name, output) -} +}
\ No newline at end of file diff --git a/internal/runchecks.go b/internal/runchecks.go index 788e77d..fb7a9c4 100644 --- a/internal/runchecks.go +++ b/internal/runchecks.go @@ -3,6 +3,7 @@ package internal import ( "context" "log" + "math/rand" "sync" "time" ) @@ -36,6 +37,25 @@ func runChecks(ctx context.Context, state state, conf config) state { inputWg.Add(len(conf.Checks)) for check := range inputCh { + if age := state.age(check.name); check.RunInterval > int(age.Seconds()) { + lastCheckState, ok := state.checks[check.name] + if ok { + log.Printf("Skipping %s: interval not yet reached (%v (%v) <= %v)", check.name, + int(age.Seconds()), age, check.RunInterval) + outputCh <- checkResult{ + name: check.name, + output: lastCheckState.output, + epoch: lastCheckState.Epoch, + status: lastCheckState.Status, + federated: lastCheckState.federated, + } + inputWg.Done() + continue + } + log.Println("Something went wrong... expected check state for", check, + "bug got nothing! Proceeding anyway") + } + go func(check namedCheck) { outputCh <- runCheck(ctx, limitCh, deps, check, conf, check.Retries) inputWg.Done() @@ -52,14 +72,20 @@ func runChecks(ctx context.Context, state state, conf config) state { return state } -func runCheck(ctx context.Context, limitCh chan struct{}, - deps dependency, check namedCheck, conf config, retries int) checkResult { - +func runCheck(ctx context.Context, limitCh chan struct{}, deps dependency, + check namedCheck, conf config, retries int, +) checkResult { if err := deps.wait(ctx, check.DependsOn); err != nil { deps.notOk(check.name) return check.skip(err.Error()) } + if check.RandomSpread > 0 { + d := time.Duration(rand.Intn(check.RandomSpread)) * time.Second + log.Printf("Sleeping %v before running %s", d, check.name) + time.Sleep(d) + } + limitCh <- struct{}{} checkCtx, cancel := context.WithTimeout(ctx, @@ -84,4 +110,4 @@ func runCheck(ctx context.Context, limitCh chan struct{}, <-limitCh return checkResult -} +}
\ No newline at end of file diff --git a/internal/state.go b/internal/state.go index 8de7f15..dceb108 100644 --- a/internal/state.go +++ b/internal/state.go @@ -83,6 +83,14 @@ func (s state) update(result checkResult) { log.Println(result.name, cs) } +func (s state) age(name string) time.Duration { + if prevState, ok := s.checks[name]; ok { + return time.Since(time.Unix(prevState.Epoch, 0)) + } + + return time.Duration(0) +} + // To be used to merge the state of another server running Gogios func (s state) merge(other state) error { for name, cs := range other.checks { @@ -105,7 +113,7 @@ func (s state) mergeFromBytes(bytes []byte) error { func (s state) persist() error { stateDir := filepath.Dir(s.stateFile) if _, err := os.Stat(stateDir); os.IsNotExist(err) { - if err := os.MkdirAll(stateDir, 0755); err != nil { + if err := os.MkdirAll(stateDir, 0o755); err != nil { return err } } @@ -180,8 +188,8 @@ func (s state) reportChanged(sb *strings.Builder) (changed bool) { } func (s state) reportUnhandled(sb *strings.Builder) (numCriticals, numWarnings, - numUnknown, numOK int) { - + numUnknown, numOK int, +) { numCriticals = s.reportBy(sb, false, false, func(cs checkState) bool { return cs.Status == nagiosCritical }) @@ -208,8 +216,8 @@ func (s state) reportStaleAlerts(sb *strings.Builder) int { } func (s state) reportBy(sb *strings.Builder, showStatusChange, isStaleReport bool, - filter func(cs checkState) bool) (count int) { - + filter func(cs checkState) bool, +) (count int) { for name, cs := range s.checks { if !filter(cs) { continue @@ -254,4 +262,4 @@ func (s state) countBy(filter func(cs checkState) bool) (count int) { } } return -} +}
\ No newline at end of file diff --git a/internal/state_test.go b/internal/state_test.go new file mode 100644 index 0000000..aacc023 --- /dev/null +++ b/internal/state_test.go @@ -0,0 +1,24 @@ +package internal + +import ( + "testing" + "time" +) + +func TestAge(t *testing.T) { + state := state{checks: make(map[string]checkState)} + + state.checks["Check Foo"] = checkState{Epoch: 0} + minAge := time.Duration(time.Now().Unix()) + + if reportedAge := state.age("Check Foo"); reportedAge < minAge { + t.Errorf("expected age >= %v, got %v", minAge, reportedAge) + } + + maxAge := time.Duration(time.Now().Unix()) + state.checks["Check Bar"] = checkState{Epoch: time.Now().Unix()} + + if reportedAge := state.age("Check Bar"); reportedAge >= minAge { + t.Errorf("expected age < %v, got %v", maxAge, reportedAge) + } +} |
