minitor-go/monitor.go

258 lines
6.6 KiB
Go
Raw Permalink Normal View History

2019-09-21 22:03:26 +00:00
package main
import (
"errors"
"fmt"
2019-10-02 16:37:29 +00:00
"math"
2019-09-21 22:03:26 +00:00
"os/exec"
"time"
2021-05-11 03:12:18 +00:00
"git.iamthefij.com/iamthefij/slog"
2019-09-21 22:03:26 +00:00
)
// Monitor represents a particular periodic check of a command.
//
// Fields carrying an `hcl` tag are configuration values; the unexported
// fields hold runtime state that Check and its helpers update.
type Monitor struct { //nolint:maligned
	// Config values

	// CheckIntervalStr is the raw check_interval string. Init parses it
	// into CheckInterval using time.ParseDuration.
	CheckIntervalStr *string `hcl:"check_interval,optional"`

	// CheckInterval is the minimum time between checks. ShouldCheck treats
	// zero as "always ready".
	CheckInterval time.Duration

	// Name identifies the monitor in log lines, errors and alert notices.
	Name string `hcl:"name,label"`

	// AlertAfter is the number of consecutive failures required before the
	// first alert notice is produced. Validate requires it to be > 0.
	AlertAfter int `hcl:"alert_after,optional"`

	// AlertEvery selects the alert cadence once AlertAfter has been reached:
	// nil or 0 alerts once, a positive N alerts every N failures, and a
	// negative value uses an exponential backoff.
	AlertEvery *int `hcl:"alert_every,optional"`

	// AlertDown lists the alert names returned by GetAlertNames(false).
	AlertDown []string `hcl:"alert_down,optional"`

	// AlertUp lists the alert names returned by GetAlertNames(true).
	AlertUp []string `hcl:"alert_up,optional"`

	// Command is an argv-style command executed directly by Check.
	Command []string `hcl:"command,optional"`

	// ShellCommand is a command line that Check runs through the
	// ShellCommand helper when Command is empty.
	ShellCommand string `hcl:"shell_command,optional"`

	// Other values

	// alertCount counts alert notices produced since the last success.
	alertCount int

	// failureCount counts failed checks since the last success.
	failureCount int

	// lastCheck is the time the most recent check finished.
	lastCheck time.Time

	// lastSuccess is the time the most recent successful check was recorded.
	lastSuccess time.Time

	// lastOutput is the combined output of the most recent check.
	lastOutput string

	// lastCheckDuration is how long the most recent check took to run.
	lastCheckDuration time.Duration
}
// Init fills in derived and default values on the Monitor.
//
// It parses CheckIntervalStr (when set) into CheckInterval, and applies the
// supplied defaults to AlertAfter, AlertEvery, AlertDown and AlertUp wherever
// the monitor has no value of its own. An error is returned only when the
// check_interval string cannot be parsed as a duration.
func (monitor *Monitor) Init(defaultAlertAfter int, defaultAlertEvery *int, defaultAlertDown []string, defaultAlertUp []string) error {
	if raw := monitor.CheckIntervalStr; raw != nil {
		parsed, parseErr := time.ParseDuration(*raw)
		// The result is stored before the error check, so the field is
		// overwritten even when parsing fails.
		monitor.CheckInterval = parsed

		if parseErr != nil {
			return fmt.Errorf("failed to parse check_interval duration for monitor %s: %w", monitor.Name, parseErr)
		}
	}

	// An unset alert_after falls back to the default, but never below 1.
	if monitor.AlertAfter == 0 {
		monitor.AlertAfter = max(defaultAlertAfter, 1)
	}

	if monitor.AlertEvery == nil {
		monitor.AlertEvery = defaultAlertEvery
	}

	if len(monitor.AlertDown) == 0 {
		monitor.AlertDown = defaultAlertDown
	}

	if len(monitor.AlertUp) == 0 {
		monitor.AlertUp = defaultAlertUp
	}

	return nil
}
// Validate checks that the Monitor is properly configured and returns errors if not
func (monitor Monitor) Validate() error {
2022-01-27 00:34:31 +00:00
hasCommand := len(monitor.Command) > 0
hasShellCommand := monitor.ShellCommand != ""
hasValidAlertAfter := monitor.AlertAfter > 0
2022-01-27 00:34:31 +00:00
hasAlertDown := len(monitor.AlertDown) > 0
var err error
2022-01-27 00:34:31 +00:00
hasAtLeastOneCommand := hasCommand || hasShellCommand
if !hasAtLeastOneCommand {
err = errors.Join(err, fmt.Errorf(
"%w: monitor %s has no command or shell_command configured",
ErrInvalidMonitor,
monitor.Name,
))
}
2022-01-27 00:34:31 +00:00
hasAtMostOneCommand := !(hasCommand && hasShellCommand)
if !hasAtMostOneCommand {
err = errors.Join(err, fmt.Errorf(
"%w: monitor %s has both command and shell_command configured",
ErrInvalidMonitor,
monitor.Name,
))
}
if !hasValidAlertAfter {
err = errors.Join(err, fmt.Errorf(
"%w: monitor %s has invalid alert_after value %d. Must be greater than 0",
ErrInvalidMonitor,
monitor.Name,
monitor.AlertAfter,
))
}
if !hasAlertDown {
err = errors.Join(err, fmt.Errorf(
"%w: monitor %s has no alert_down configured. Configure one here or add a default_alert_down",
ErrInvalidMonitor,
monitor.Name,
))
}
2022-01-27 00:34:31 +00:00
return err
2019-09-21 22:03:26 +00:00
}
// LastOutput returns the output recorded by the most recent call to Check.
// It is the empty string until a check has run.
func (monitor Monitor) LastOutput() string {
	return monitor.lastOutput
}
// ShouldCheck returns a boolean indicating if the Monitor is ready to be be checked again
2019-09-21 22:03:26 +00:00
func (monitor Monitor) ShouldCheck() bool {
2022-01-27 00:34:31 +00:00
if monitor.lastCheck.IsZero() || monitor.CheckInterval == 0 {
2019-09-21 22:03:26 +00:00
return true
}
2021-05-11 04:39:52 +00:00
sinceLastCheck := time.Since(monitor.lastCheck)
2021-05-11 04:00:58 +00:00
return sinceLastCheck >= monitor.CheckInterval
2019-09-21 22:03:26 +00:00
}
// Check will run the command configured by the Monitor and return a status and a possible AlertNotice
2019-10-02 16:37:29 +00:00
func (monitor *Monitor) Check() (bool, *AlertNotice) {
2019-09-21 22:03:26 +00:00
var cmd *exec.Cmd
2022-01-27 00:34:31 +00:00
if len(monitor.Command) > 0 {
cmd = exec.Command(monitor.Command[0], monitor.Command[1:]...)
} else if monitor.ShellCommand != "" {
cmd = ShellCommand(monitor.ShellCommand)
2019-09-21 22:03:26 +00:00
} else {
2022-01-27 00:34:31 +00:00
slog.Fatalf("Monitor %s has no command configured", monitor.Name)
2019-09-21 22:03:26 +00:00
}
2021-05-11 17:41:22 +00:00
checkStartTime := time.Now()
2019-09-21 22:03:26 +00:00
output, err := cmd.CombinedOutput()
2019-10-02 16:37:29 +00:00
monitor.lastCheck = time.Now()
monitor.lastOutput = string(output)
2021-05-11 17:41:22 +00:00
monitor.lastCheckDuration = monitor.lastCheck.Sub(checkStartTime)
2019-09-21 22:03:26 +00:00
2019-10-02 16:37:29 +00:00
var alertNotice *AlertNotice
2021-05-11 04:00:58 +00:00
isSuccess := (err == nil)
2019-10-02 16:37:29 +00:00
if isSuccess {
alertNotice = monitor.success()
2019-09-21 22:03:26 +00:00
} else {
2019-10-02 16:37:29 +00:00
alertNotice = monitor.failure()
2019-09-21 22:03:26 +00:00
}
2021-05-11 03:12:18 +00:00
slog.Debugf("Command output: %s", monitor.lastOutput)
slog.OnErrWarnf(err, "Command result: %v", err)
2021-05-11 03:12:18 +00:00
slog.Infof(
"%s success=%t, alert=%t",
2019-10-02 16:37:29 +00:00
monitor.Name,
isSuccess,
alertNotice != nil,
2019-10-02 16:37:29 +00:00
)
return isSuccess, alertNotice
}
// GetAlertNames returns the alert names configured for the given status:
// AlertUp when up is true, AlertDown otherwise.
func (monitor Monitor) GetAlertNames(up bool) []string {
	if !up {
		return monitor.AlertDown
	}

	return monitor.AlertUp
}
// IsUp returns the status of the current monitor
func (monitor Monitor) IsUp() bool {
2019-10-02 16:37:29 +00:00
return monitor.alertCount == 0
2019-09-21 22:03:26 +00:00
}
2021-05-11 17:41:22 +00:00
// LastCheckMilliseconds gives the number of milliseconds the last check ran for.
func (monitor Monitor) LastCheckMilliseconds() int64 {
	elapsed := monitor.lastCheckDuration

	return elapsed.Milliseconds()
}
2019-10-02 16:37:29 +00:00
func (monitor *Monitor) success() (notice *AlertNotice) {
if !monitor.IsUp() {
2019-10-02 16:37:29 +00:00
// Alert that we have recovered
notice = monitor.createAlertNotice(true)
}
2021-05-11 04:00:58 +00:00
2019-10-02 16:37:29 +00:00
monitor.failureCount = 0
monitor.alertCount = 0
monitor.lastSuccess = time.Now()
return
2019-09-21 22:03:26 +00:00
}
2019-10-02 16:37:29 +00:00
func (monitor *Monitor) failure() (notice *AlertNotice) {
monitor.failureCount++
// If we haven't hit the minimum failures, we can exit
if monitor.failureCount < monitor.AlertAfter {
2021-05-11 03:12:18 +00:00
slog.Debugf(
"%s failed but did not hit minimum failures. "+
"Count: %v alert after: %v",
monitor.Name,
monitor.failureCount,
monitor.AlertAfter,
2021-05-11 03:12:18 +00:00
)
2019-10-02 16:37:29 +00:00
return
}
// Take number of failures after minimum
failureCount := (monitor.failureCount - monitor.AlertAfter)
2019-10-02 16:37:29 +00:00
// Use alert cadence to determine if we should alert
2021-05-11 04:00:58 +00:00
switch {
case monitor.AlertEvery == nil, *monitor.AlertEvery == 0:
2019-10-02 16:37:29 +00:00
// Handle alerting on first failure only
if failureCount == 0 {
2019-10-02 16:37:29 +00:00
notice = monitor.createAlertNotice(false)
}
case *monitor.AlertEvery > 0:
// Handle integer number of failures before alerting
if failureCount%*monitor.AlertEvery == 0 {
notice = monitor.createAlertNotice(false)
}
2021-05-11 04:00:58 +00:00
default:
2019-10-02 16:37:29 +00:00
// Handle negative numbers indicating an exponential backoff
2022-01-27 00:34:31 +00:00
if failureCount >= int(math.Pow(2, float64(monitor.alertCount))-1) { //nolint:gomnd
2019-10-02 16:37:29 +00:00
notice = monitor.createAlertNotice(false)
}
}
// If we're going to alert, increment count
2019-10-02 16:37:29 +00:00
if notice != nil {
monitor.alertCount++
}
2021-05-11 04:00:58 +00:00
return notice
2019-10-02 16:37:29 +00:00
}
func (monitor Monitor) createAlertNotice(isUp bool) *AlertNotice {
// TODO: Maybe add something about recovery status here
return &AlertNotice{
MonitorName: monitor.Name,
AlertCount: monitor.alertCount,
FailureCount: monitor.failureCount,
LastCheckOutput: monitor.lastOutput,
LastSuccess: monitor.lastSuccess,
IsUp: isUp,
}
2019-09-21 22:03:26 +00:00
}