1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
|
package internal
import (
"context"
"log"
"math/rand"
"sync"
"time"
)
// runChecks decides for every entry in conf.Checks whether it has to run or
// whether its previous result is carried forward, runs the due checks
// concurrently, applies all results to state and returns the updated state.
//
// A check is skipped when its RunInterval (seconds) is greater than the age
// reported by state.age and a previous entry exists in state.checks; the
// previous output, epoch, status and federation origin are then re-submitted
// unchanged. If no previous entry exists the check is run anyway. Every other
// check runs in its own goroutine through runCheck, with the number of
// simultaneously executing checks bounded by conf.CheckConcurrency. Values
// below 1 are treated as 1, because a zero-capacity limiter could never be
// acquired and every check would block forever.
//
// Results are buffered and applied to state only after all checks have
// finished. The scheduling loop reads state.checks directly, so applying
// updates from another goroutine while the loop is still running would be
// unsynchronized concurrent access.
// NOTE(review): this assumes state.update writes to state.checks; its body is
// not visible here — confirm.
func runChecks(ctx context.Context, state state, conf config) state {
	concurrency := conf.CheckConcurrency
	if concurrency < 1 {
		concurrency = 1
	}

	var (
		limitCh = make(chan struct{}, concurrency)
		// One slot per configured check: no producer ever blocks, so the
		// results can be drained after all work is done.
		outputCh = make(chan checkResult, len(conf.Checks))
		deps     = newDependency(conf)
		wg       sync.WaitGroup
	)

	for name, check := range conf.Checks {
		nc := namedCheck{check, name}

		if age := state.age(nc.name); nc.RunInterval > int(age.Seconds()) {
			if last, ok := state.checks[nc.name]; ok {
				log.Printf("Skipping %s: interval not yet reached (%v (%v) <= %v)", nc.name,
					int(age.Seconds()), age, nc.RunInterval)
				outputCh <- checkResult{
					name:          nc.name,
					output:        last.Output,
					epoch:         last.Epoch,
					status:        last.Status,
					federatedFrom: last.FederatedFrom,
				}
				continue
			}
			log.Println("Something went wrong... expected check state for", nc,
				"but got nothing! Proceeding anyway")
		}

		wg.Add(1)
		go func(nc namedCheck) {
			defer wg.Done()
			outputCh <- runCheck(ctx, limitCh, deps, nc, conf, nc.Retries)
		}(nc)
	}

	wg.Wait()
	log.Println("All checks completed!")

	// All producers are done; closing lets the drain loop below terminate.
	close(outputCh)
	for res := range outputCh {
		state.update(res)
	}
	log.Println("All outputs collected!")

	return state
}
// runCheck executes a single check, including its dependency wait, optional
// start delay and retries, and returns the result of the last attempt.
//
// Each attempt performs these steps:
//  1. Wait for check.DependsOn through deps.wait. On error the check is
//     reported to deps as not ok and check.skip(err.Error()) is returned.
//  2. If check.RandomSpread > 0, sleep a random whole number of seconds in
//     [0, RandomSpread).
//  3. Acquire a slot on limitCh and run the check with a timeout of
//     conf.CheckTimeoutS seconds derived from ctx.
//  4. If the status is not nagiosOk and retries remain, release the slot,
//     sleep check.RetryInterval seconds and start over with one retry less.
//
// After the final attempt deps is told notOk when the status is
// nagiosCritical and ok otherwise, and the slot is released.
//
// Both sleeps end early once ctx is done, so cancellation is not held up by
// the spread or retry delays. The per-attempt timeout context is released
// right after the attempt rather than when the whole retry chain returns.
func runCheck(ctx context.Context, limitCh chan struct{}, deps dependency,
	check namedCheck, conf config, retries int,
) checkResult {
	// pause blocks for d, or until ctx is done, whichever comes first.
	pause := func(d time.Duration) {
		if d <= 0 {
			return
		}
		timer := time.NewTimer(d)
		defer timer.Stop()
		select {
		case <-timer.C:
		case <-ctx.Done():
		}
	}

	for {
		if err := deps.wait(ctx, check.DependsOn); err != nil {
			deps.notOk(check.name)
			return check.skip(err.Error())
		}

		if check.RandomSpread > 0 {
			d := time.Duration(rand.Intn(check.RandomSpread)) * time.Second
			log.Printf("Sleeping %v before running %s", d, check.name)
			pause(d)
		}

		// Take a concurrency slot for the duration of the attempt only.
		limitCh <- struct{}{}
		checkCtx, cancel := context.WithTimeout(ctx,
			time.Duration(conf.CheckTimeoutS)*time.Second)
		res := check.run(checkCtx)
		cancel()

		if res.status != nagiosOk && retries > 0 {
			// Give the slot back before waiting so other checks can run.
			<-limitCh
			retryDuration := time.Duration(check.RetryInterval) * time.Second
			pause(retryDuration)
			log.Printf("Retrying %s after %v", check.name, retryDuration)
			retries--
			continue
		}

		if res.status == nagiosCritical {
			deps.notOk(check.name)
		} else {
			deps.ok(check.name)
		}
		<-limitCh
		return res
	}
}
|