summaryrefslogtreecommitdiff
path: root/internal/runchecks.go
blob: 6d88f547ff0fa9706d5da475e6ea7e173f8f54d1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
package internal

import (
	"context"
	"log"
	"math/rand"
	"sync"
	"time"
)

// runChecks processes every check in conf.Checks once and returns the state
// with all results applied.
//
// A check whose RunInterval (in seconds) is still larger than the age of its
// last recorded state is not executed; its previous result is carried over
// unchanged. Every other check is started in its own goroutine via runCheck,
// which uses limitCh to cap how many checks execute at the same time.
//
// Results are buffered while checks are in flight and only applied to state
// after all of them have finished. That keeps the reads of state in the
// scheduling loop (state.age, state.checks) from overlapping with
// state.update, which presumably writes the same map.
func runChecks(ctx context.Context, state state, conf config) state {
	var (
		// NOTE(review): with CheckConcurrency == 0 this channel is unbuffered
		// and runCheck's slot acquisition can never succeed; confirm the
		// configuration is validated to be >= 1 before it gets here.
		limitCh = make(chan struct{}, conf.CheckConcurrency)
		// Each check produces exactly one result, so this capacity guarantees
		// that no sender ever blocks and no collector goroutine is needed.
		outputCh = make(chan checkResult, len(conf.Checks))
		deps     = newDependency(conf)
	)

	var running sync.WaitGroup

	for name, c := range conf.Checks {
		check := namedCheck{c, name}

		if age := state.age(check.name); check.RunInterval > int(age.Seconds()) {
			if last, ok := state.checks[check.name]; ok {
				log.Printf("Skipping %s: interval not yet reached (%v (%v) < %v)", check.name,
					int(age.Seconds()), age, check.RunInterval)
				// Re-emit the previous result so the check stays present in
				// the resulting state.
				outputCh <- checkResult{
					name:          check.name,
					output:        last.Output,
					epoch:         last.Epoch,
					status:        last.Status,
					federatedFrom: last.FederatedFrom,
				}
				continue
			}
			// The interval has not elapsed, yet there is no previous state to
			// reuse: run the check rather than dropping it.
			log.Println("Something went wrong... expected check state for", check.name,
				"but got nothing! Proceeding anyway")
		}

		running.Add(1)
		go func(check namedCheck) {
			defer running.Done()
			outputCh <- runCheck(ctx, limitCh, deps, check, conf, check.Retries)
		}(check)
	}

	running.Wait()
	log.Println("All checks completed!")
	close(outputCh)

	// All producers are done; drain the buffered results in arrival order.
	for result := range outputCh {
		state.update(result)
	}
	log.Println("All outputs collected!")

	return state
}

// runCheck executes a single check, retrying up to retries times while the
// result is not nagiosOk, and returns the final result.
//
// Each attempt:
//  1. waits for the check's dependencies via deps.wait; on error the check is
//     marked not-ok in deps and a skip result carrying the error text is
//     returned,
//  2. if RandomSpread > 0, pauses a random whole number of seconds in
//     [0, RandomSpread),
//  3. takes a slot from limitCh, runs the check with a timeout of
//     conf.CheckTimeoutS seconds, and gives the slot back.
//
// Between attempts it pauses for check.RetryInterval seconds without holding
// a slot. All pauses and the slot acquisition stop early when ctx is done:
// before a first result exists the check is reported as skipped; during a
// retry pause the most recent result is returned instead.
//
// The final result marks the check not-ok in deps when its status is
// nagiosCritical and ok otherwise.
func runCheck(ctx context.Context, limitCh chan struct{}, deps dependency,
	check namedCheck, conf config, retries int,
) checkResult {
	// pause blocks for d or until ctx is done, whichever happens first, and
	// returns the context's error in the latter case.
	pause := func(d time.Duration) error {
		if d <= 0 {
			return ctx.Err()
		}
		timer := time.NewTimer(d)
		defer timer.Stop()

		select {
		case <-timer.C:
			return nil
		case <-ctx.Done():
			return ctx.Err()
		}
	}

	// abort records the check as not-ok for its dependents and produces the
	// skip result, mirroring the handling of a failed dependency wait.
	abort := func(err error) checkResult {
		deps.notOk(check.name)
		return check.skip(err.Error())
	}

	timeout := time.Duration(conf.CheckTimeoutS) * time.Second

	for {
		if err := deps.wait(ctx, check.DependsOn); err != nil {
			return abort(err)
		}

		if check.RandomSpread > 0 {
			d := time.Duration(rand.Intn(check.RandomSpread)) * time.Second
			log.Printf("Sleeping %v before running %s", d, check.name)
			if err := pause(d); err != nil {
				return abort(err)
			}
		}

		// Acquire a concurrency slot, but do not block past cancellation.
		select {
		case limitCh <- struct{}{}:
		case <-ctx.Done():
			return abort(ctx.Err())
		}

		checkCtx, cancel := context.WithTimeout(ctx, timeout)
		result := check.run(checkCtx)
		// Release the timeout context and the slot right away so neither is
		// held during a retry pause or subsequent attempts.
		cancel()
		<-limitCh

		if result.status != nagiosOk && retries > 0 {
			retryDuration := time.Duration(check.RetryInterval) * time.Second
			if err := pause(retryDuration); err == nil {
				log.Printf("Retrying %s after %v", check.name, retryDuration)
				retries--
				continue
			}
			// Cancelled while waiting to retry: report the last result.
		}

		if result.status == nagiosCritical {
			deps.notOk(check.name)
		} else {
			deps.ok(check.name)
		}
		return result
	}
}