2019-09-21 15:03:26 -07:00
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
2021-05-10 21:00:58 -07:00
|
|
|
"errors"
|
2019-10-04 16:05:25 -07:00
|
|
|
"flag"
|
2019-10-04 15:46:49 -07:00
|
|
|
"fmt"
|
2024-04-03 12:03:17 -07:00
|
|
|
"strings"
|
2019-09-21 15:03:26 -07:00
|
|
|
"time"
|
2021-05-10 20:12:18 -07:00
|
|
|
|
|
|
|
"git.iamthefij.com/iamthefij/slog"
|
2019-09-21 15:03:26 -07:00
|
|
|
)
|
|
|
|
|
2019-10-04 16:17:20 -07:00
|
|
|
var (
|
2019-11-15 11:25:21 -08:00
|
|
|
// ExportMetrics will track whether or not we want to export metrics to prometheus
|
|
|
|
ExportMetrics = false
|
|
|
|
// MetricsPort is the port to expose metrics on
|
|
|
|
MetricsPort = 8080
|
|
|
|
// Metrics contains all active metrics
|
|
|
|
Metrics = NewMetrics()
|
|
|
|
|
2019-10-04 16:17:20 -07:00
|
|
|
// version of minitor being run
|
|
|
|
version = "dev"
|
2021-05-10 21:00:58 -07:00
|
|
|
|
|
|
|
errUnknownAlert = errors.New("unknown alert")
|
2019-10-04 16:17:20 -07:00
|
|
|
)
|
2019-10-04 16:05:25 -07:00
|
|
|
|
2024-11-15 11:30:34 -08:00
|
|
|
func SendAlerts(config *Config, monitor *Monitor, alertNotice *AlertNotice) error {
|
2021-05-10 21:00:58 -07:00
|
|
|
slog.Debugf("Received an alert notice from %s", alertNotice.MonitorName)
|
|
|
|
alertNames := monitor.GetAlertNames(alertNotice.IsUp)
|
|
|
|
|
|
|
|
if alertNames == nil {
|
|
|
|
// This should only happen for a recovery alert. AlertDown is validated not empty
|
|
|
|
slog.Warningf(
|
|
|
|
"Received alert, but no alert mechanisms exist. MonitorName=%s IsUp=%t",
|
|
|
|
alertNotice.MonitorName, alertNotice.IsUp,
|
|
|
|
)
|
2021-09-02 10:19:03 -07:00
|
|
|
|
|
|
|
return nil
|
2021-05-10 21:00:58 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
for _, alertName := range alertNames {
|
2022-01-26 16:34:31 -08:00
|
|
|
if alert, ok := config.GetAlert(alertName); ok {
|
2021-05-10 21:00:58 -07:00
|
|
|
output, err := alert.Send(*alertNotice)
|
|
|
|
if err != nil {
|
|
|
|
slog.Errorf(
|
|
|
|
"Alert '%s' failed. result=%v: output=%s",
|
|
|
|
alert.Name,
|
|
|
|
err,
|
|
|
|
output,
|
|
|
|
)
|
|
|
|
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Count alert metrics
|
|
|
|
Metrics.CountAlert(monitor.Name, alert.Name)
|
|
|
|
} else {
|
|
|
|
// This case should never actually happen since we validate against it
|
|
|
|
slog.Errorf("Unknown alert for monitor %s: %s", alertNotice.MonitorName, alertName)
|
|
|
|
|
|
|
|
return fmt.Errorf("unknown alert for monitor %s: %s: %w", alertNotice.MonitorName, alertName, errUnknownAlert)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2024-11-15 11:30:34 -08:00
|
|
|
func CheckMonitors(config *Config) error {
|
2021-09-02 10:19:03 -07:00
|
|
|
// TODO: Run this in goroutines and capture exceptions
|
2019-10-04 14:47:38 -07:00
|
|
|
for _, monitor := range config.Monitors {
|
|
|
|
if monitor.ShouldCheck() {
|
2019-11-15 11:25:21 -08:00
|
|
|
success, alertNotice := monitor.Check()
|
|
|
|
hasAlert := alertNotice != nil
|
|
|
|
|
|
|
|
// Track status metrics
|
2020-07-14 17:09:56 -07:00
|
|
|
Metrics.SetMonitorStatus(monitor.Name, monitor.IsUp())
|
2021-05-11 10:41:22 -07:00
|
|
|
Metrics.CountCheck(monitor.Name, success, monitor.LastCheckMilliseconds(), hasAlert)
|
2019-10-04 14:47:38 -07:00
|
|
|
|
|
|
|
if alertNotice != nil {
|
2024-11-15 11:30:34 -08:00
|
|
|
err := SendAlerts(config, monitor, alertNotice)
|
2021-09-02 10:20:04 -07:00
|
|
|
// If there was an error in sending an alert, exit early and bubble it up
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2019-10-04 14:47:38 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2019-10-04 16:17:36 -07:00
|
|
|
|
|
|
|
return nil
|
2019-10-04 14:47:38 -07:00
|
|
|
}
|
|
|
|
|
2024-11-15 11:30:34 -08:00
|
|
|
func SendStartupAlerts(config *Config, alertNames []string) error {
|
2024-04-03 12:03:17 -07:00
|
|
|
for _, alertName := range alertNames {
|
|
|
|
var err error
|
|
|
|
|
2022-01-26 16:34:31 -08:00
|
|
|
alert, ok := config.GetAlert(alertName)
|
2024-04-03 12:03:17 -07:00
|
|
|
if !ok {
|
|
|
|
err = fmt.Errorf("unknown alert %s: %w", alertName, errUnknownAlert)
|
|
|
|
}
|
|
|
|
|
|
|
|
if err == nil {
|
|
|
|
_, err = alert.Send(AlertNotice{
|
|
|
|
AlertCount: 0,
|
|
|
|
FailureCount: 0,
|
|
|
|
IsUp: true,
|
|
|
|
LastSuccess: time.Now(),
|
|
|
|
MonitorName: fmt.Sprintf("First Run Alert Test: %s", alert.Name),
|
|
|
|
LastCheckOutput: "",
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2019-09-21 15:03:26 -07:00
|
|
|
func main() {
|
2021-05-10 20:12:18 -07:00
|
|
|
showVersion := flag.Bool("version", false, "Display the version of minitor and exit")
|
|
|
|
configPath := flag.String("config", "config.yml", "Alternate configuration path (default: config.yml)")
|
2024-04-03 12:03:17 -07:00
|
|
|
startupAlerts := flag.String("startup-alerts", "", "List of alerts to run on startup. This can help determine unhealthy alerts early on. (default \"\")")
|
2021-05-10 20:12:18 -07:00
|
|
|
|
|
|
|
flag.BoolVar(&slog.DebugLevel, "debug", false, "Enables debug logs (default: false)")
|
2019-11-15 11:25:21 -08:00
|
|
|
flag.BoolVar(&ExportMetrics, "metrics", false, "Enables prometheus metrics exporting (default: false)")
|
2021-09-02 10:19:03 -07:00
|
|
|
flag.IntVar(&MetricsPort, "metrics-port", MetricsPort, "The port that Prometheus metrics should be exported on, if enabled. (default: 8080)")
|
2019-10-04 16:05:25 -07:00
|
|
|
flag.Parse()
|
|
|
|
|
2019-10-04 16:17:20 -07:00
|
|
|
// Print version if flag is provided
|
|
|
|
if *showVersion {
|
2021-05-10 20:12:18 -07:00
|
|
|
fmt.Println("Minitor version:", version)
|
|
|
|
|
2019-10-04 16:17:20 -07:00
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// Load configuration
|
2020-02-18 00:47:30 +00:00
|
|
|
config, err := LoadConfig(*configPath)
|
2021-05-10 20:12:18 -07:00
|
|
|
slog.OnErrFatalf(err, "Error loading config: %v", err)
|
2019-09-21 15:03:26 -07:00
|
|
|
|
2019-11-15 11:25:21 -08:00
|
|
|
// Serve metrics exporter, if specified
|
|
|
|
if ExportMetrics {
|
2021-05-11 10:41:22 -07:00
|
|
|
slog.Infof("Exporting metrics to Prometheus on port %d", MetricsPort)
|
2021-05-10 20:12:18 -07:00
|
|
|
|
2019-11-15 11:25:21 -08:00
|
|
|
go ServeMetrics()
|
|
|
|
}
|
|
|
|
|
2024-04-03 12:03:17 -07:00
|
|
|
if *startupAlerts != "" {
|
|
|
|
alertNames := strings.Split(*startupAlerts, ",")
|
|
|
|
|
2024-11-15 11:30:34 -08:00
|
|
|
err = SendStartupAlerts(&config, alertNames)
|
2024-04-03 12:03:17 -07:00
|
|
|
|
|
|
|
slog.OnErrPanicf(err, "Error running startup alerts")
|
|
|
|
}
|
|
|
|
|
2019-10-04 14:47:38 -07:00
|
|
|
// Start main loop
|
2019-09-21 15:03:26 -07:00
|
|
|
for {
|
2024-11-15 11:30:34 -08:00
|
|
|
err = CheckMonitors(&config)
|
2021-09-02 10:19:03 -07:00
|
|
|
slog.OnErrPanicf(err, "Error checking monitors")
|
2019-09-21 15:03:26 -07:00
|
|
|
|
2023-04-19 15:31:12 -07:00
|
|
|
time.Sleep(config.CheckInterval)
|
2019-09-21 15:03:26 -07:00
|
|
|
}
|
|
|
|
}
|