minitor-go/monitor.go

158 lines
4.0 KiB
Go
Raw Normal View History

2019-09-21 22:03:26 +00:00
package main
import (
"log"
2019-10-02 16:37:29 +00:00
"math"
2019-09-21 22:03:26 +00:00
"os/exec"
"time"
)
2019-10-02 16:37:29 +00:00
// Monitor represents a particular periodic check of a command.
//
// The exported fields are configuration values. The unexported fields hold
// runtime state that is updated as checks succeed or fail.
type Monitor struct {
	// Config values

	// Name identifies the monitor in log lines and alert notices.
	Name string
	// Command is the program to execute followed by its arguments. It is
	// mutually exclusive with CommandShell.
	Command []string
	// CommandShell is a command line that is handed to ShellCommand. It is
	// mutually exclusive with Command.
	CommandShell string `yaml:"command_shell"`
	// AlertDown and AlertUp are not read in this file.
	// NOTE(review): presumably the names of the alerts to fire when the
	// monitor goes down / recovers; confirm against the caller.
	AlertDown []string `yaml:"alert_down"`
	AlertUp   []string `yaml:"alert_up"`
	// CheckInterval is the minimum number of seconds between two checks.
	CheckInterval float64 `yaml:"check_interval"`
	// AlertAfter is the number of consecutive failures required before any
	// alert is considered. It must not be negative.
	AlertAfter int16 `yaml:"alert_after"`
	// AlertEvery selects the alert cadence once AlertAfter has been reached:
	// a positive value alerts every that many failures, zero alerts a single
	// time per outage, and a negative value uses an exponential backoff.
	AlertEvery int16 `yaml:"alert_every"`

	// Other values

	// lastCheck is when the command was last run; zero means never.
	lastCheck time.Time
	// lastOutput is the combined output captured by the last check.
	lastOutput string
	// alertCount is the number of failure alerts raised since the last
	// successful check.
	alertCount int16
	// failureCount is the number of consecutive failed checks.
	failureCount int16
	// lastSuccess is when a check last succeeded; zero means never.
	lastSuccess time.Time
}
2019-10-02 16:37:29 +00:00
// IsValid returns a boolean indicating if the Monitor has been correctly
// configured
2019-09-21 22:03:26 +00:00
func (monitor Monitor) IsValid() bool {
atLeastOneCommand := (monitor.CommandShell != "" || monitor.Command != nil)
atMostOneCommand := (monitor.CommandShell == "" || monitor.Command == nil)
return atLeastOneCommand && atMostOneCommand && monitor.AlertAfter >= 0
2019-09-21 22:03:26 +00:00
}
2019-10-02 16:37:29 +00:00
// ShouldCheck returns a boolean indicating if the Monitor is ready to be
// be checked again
2019-09-21 22:03:26 +00:00
func (monitor Monitor) ShouldCheck() bool {
2019-10-02 16:37:29 +00:00
if monitor.lastCheck.IsZero() {
2019-09-21 22:03:26 +00:00
return true
}
2019-10-02 16:37:29 +00:00
sinceLastCheck := time.Now().Sub(monitor.lastCheck).Seconds()
2019-09-21 22:03:26 +00:00
return sinceLastCheck >= monitor.CheckInterval
}
2019-10-02 16:37:29 +00:00
// Check will run the command configured by the Monitor and return a status
// and a possible AlertNotice
func (monitor *Monitor) Check() (bool, *AlertNotice) {
2019-09-21 22:03:26 +00:00
var cmd *exec.Cmd
if monitor.Command != nil {
cmd = exec.Command(monitor.Command[0], monitor.Command[1:]...)
} else {
cmd = ShellCommand(monitor.CommandShell)
2019-09-21 22:03:26 +00:00
}
output, err := cmd.CombinedOutput()
2019-10-02 16:37:29 +00:00
//log.Printf("Check %s\n---\n%s\n---", monitor.Name, string(output))
2019-09-21 22:03:26 +00:00
2019-10-02 16:37:29 +00:00
isSuccess := (err == nil)
2019-09-21 22:03:26 +00:00
if err != nil {
2019-10-02 16:37:29 +00:00
log.Printf("ERROR: %v", err)
2019-09-21 22:03:26 +00:00
}
2019-10-02 16:37:29 +00:00
monitor.lastCheck = time.Now()
monitor.lastOutput = string(output)
2019-09-21 22:03:26 +00:00
2019-10-02 16:37:29 +00:00
var alertNotice *AlertNotice
if isSuccess {
alertNotice = monitor.success()
2019-09-21 22:03:26 +00:00
} else {
2019-10-02 16:37:29 +00:00
alertNotice = monitor.failure()
2019-09-21 22:03:26 +00:00
}
2019-10-02 16:37:29 +00:00
log.Printf(
"Check result for %s: %v, %v at %v",
monitor.Name,
isSuccess,
alertNotice,
monitor.lastCheck,
)
return isSuccess, alertNotice
}
func (monitor Monitor) isUp() bool {
return monitor.alertCount == 0
2019-09-21 22:03:26 +00:00
}
2019-10-02 16:37:29 +00:00
func (monitor *Monitor) success() (notice *AlertNotice) {
2019-09-21 22:03:26 +00:00
log.Printf("Great success!")
2019-10-02 16:37:29 +00:00
if !monitor.isUp() {
// Alert that we have recovered
notice = monitor.createAlertNotice(true)
}
monitor.failureCount = 0
monitor.alertCount = 0
monitor.lastSuccess = time.Now()
return
2019-09-21 22:03:26 +00:00
}
2019-10-02 16:37:29 +00:00
func (monitor *Monitor) failure() (notice *AlertNotice) {
2019-09-21 22:03:26 +00:00
log.Printf("Devastating failure. :(")
2019-10-02 16:37:29 +00:00
monitor.failureCount++
// If we haven't hit the minimum failures, we can exit
if monitor.failureCount < monitor.AlertAfter {
// TODO: Turn into a debug
log.Printf(
"Have not hit minimum failures. failures: %v alert after: %v",
monitor.failureCount,
monitor.AlertAfter,
)
return
}
failureCount := (monitor.failureCount - monitor.AlertAfter)
log.Printf("Total fail %v, this fail %v", monitor.failureCount, failureCount)
2019-10-02 16:37:29 +00:00
if monitor.AlertEvery > 0 {
// Handle integer number of failures before alerting
modVal := failureCount % monitor.AlertEvery
log.Printf("Alert every > 0: Mod val: %v", modVal)
2019-10-02 16:37:29 +00:00
if failureCount%monitor.AlertEvery == 0 {
notice = monitor.createAlertNotice(false)
}
} else if monitor.AlertEvery == 0 {
// Handle alerting on first failure only
if failureCount == 1 {
notice = monitor.createAlertNotice(false)
}
} else {
// Handle negative numbers indicating an exponential backoff
if failureCount >= int16(math.Pow(2, float64(monitor.alertCount))-1) {
notice = monitor.createAlertNotice(false)
}
}
if notice != nil {
monitor.alertCount++
}
return
}
func (monitor Monitor) createAlertNotice(isUp bool) *AlertNotice {
// TODO: Maybe add something about recovery status here
return &AlertNotice{
MonitorName: monitor.Name,
AlertCount: monitor.alertCount,
FailureCount: monitor.failureCount,
LastCheckOutput: monitor.lastOutput,
LastSuccess: monitor.lastSuccess,
IsUp: isUp,
}
2019-09-21 22:03:26 +00:00
}