2019-09-21 22:03:26 +00:00
package main
import (
2021-05-11 04:00:58 +00:00
"errors"
2019-10-04 23:05:25 +00:00
"flag"
2019-10-04 22:46:49 +00:00
"fmt"
2024-04-03 18:23:26 +00:00
"net/http"
2019-09-21 22:03:26 +00:00
"time"
2021-05-11 03:12:18 +00:00
"git.iamthefij.com/iamthefij/slog"
2019-09-21 22:03:26 +00:00
)
2019-10-04 23:17:20 +00:00
var (
2019-11-15 19:25:21 +00:00
// ExportMetrics will track whether or not we want to export metrics to prometheus
ExportMetrics = false
// MetricsPort is the port to expose metrics on
MetricsPort = 8080
// Metrics contains all active metrics
Metrics = NewMetrics ( )
2024-04-03 18:23:26 +00:00
// Self monitor rather than panicing
SelfMonitor = false
// HealthChecks contains health check values
HealthChecks * HealthCheckHandler = nil
2019-11-15 19:25:21 +00:00
2020-02-20 01:31:04 +00:00
// PyCompat enables support for legacy Python templates
PyCompat = false
2019-10-04 23:17:20 +00:00
// version of minitor being run
version = "dev"
2021-05-11 04:00:58 +00:00
errUnknownAlert = errors . New ( "unknown alert" )
2019-10-04 23:17:20 +00:00
)
2019-10-04 23:05:25 +00:00
2021-05-11 04:00:58 +00:00
func sendAlerts ( config * Config , monitor * Monitor , alertNotice * AlertNotice ) error {
slog . Debugf ( "Received an alert notice from %s" , alertNotice . MonitorName )
alertNames := monitor . GetAlertNames ( alertNotice . IsUp )
if alertNames == nil {
// This should only happen for a recovery alert. AlertDown is validated not empty
slog . Warningf (
"Received alert, but no alert mechanisms exist. MonitorName=%s IsUp=%t" ,
alertNotice . MonitorName , alertNotice . IsUp ,
)
2021-09-02 17:19:03 +00:00
return nil
2021-05-11 04:00:58 +00:00
}
for _ , alertName := range alertNames {
if alert , ok := config . Alerts [ alertName ] ; ok {
output , err := alert . Send ( * alertNotice )
if err != nil {
slog . Errorf (
"Alert '%s' failed. result=%v: output=%s" ,
alert . Name ,
err ,
output ,
)
2024-04-03 18:23:26 +00:00
if SelfMonitor {
Metrics . SetMonitorStatus ( fmt . Sprintf ( "Alert %s" , alertName ) , false )
}
2021-05-11 04:00:58 +00:00
return err
2024-04-03 18:23:26 +00:00
} else {
Metrics . SetMonitorStatus ( fmt . Sprintf ( "Alert %s" , alertName ) , true )
2021-05-11 04:00:58 +00:00
}
// Count alert metrics
Metrics . CountAlert ( monitor . Name , alert . Name )
} else {
// This case should never actually happen since we validate against it
slog . Errorf ( "Unknown alert for monitor %s: %s" , alertNotice . MonitorName , alertName )
return fmt . Errorf ( "unknown alert for monitor %s: %s: %w" , alertNotice . MonitorName , alertName , errUnknownAlert )
}
}
return nil
}
2019-10-04 23:17:36 +00:00
func checkMonitors ( config * Config ) error {
2021-09-02 17:19:03 +00:00
// TODO: Run this in goroutines and capture exceptions
2024-04-03 18:23:26 +00:00
healthy := true
2019-10-04 21:47:38 +00:00
for _ , monitor := range config . Monitors {
if monitor . ShouldCheck ( ) {
2019-11-15 19:25:21 +00:00
success , alertNotice := monitor . Check ( )
hasAlert := alertNotice != nil
// Track status metrics
2020-07-15 00:09:56 +00:00
Metrics . SetMonitorStatus ( monitor . Name , monitor . IsUp ( ) )
2021-05-11 17:41:22 +00:00
Metrics . CountCheck ( monitor . Name , success , monitor . LastCheckMilliseconds ( ) , hasAlert )
2019-10-04 21:47:38 +00:00
if alertNotice != nil {
2021-09-02 17:20:04 +00:00
err := sendAlerts ( config , monitor , alertNotice )
2024-04-03 18:23:26 +00:00
// If there was an error in sending an alert, mark as unhealthy or bubble up
2021-09-02 17:20:04 +00:00
if err != nil {
2024-04-03 18:23:26 +00:00
if SelfMonitor {
healthy = false
} else {
return err
}
2021-09-02 17:20:04 +00:00
}
2019-10-04 21:47:38 +00:00
}
}
}
2019-10-04 23:17:36 +00:00
2024-04-03 18:23:26 +00:00
if HealthChecks != nil {
HealthChecks . MinitorHealthy ( healthy )
}
2019-10-04 23:17:36 +00:00
return nil
2019-10-04 21:47:38 +00:00
}
2024-04-03 18:23:26 +00:00
// ServeMetricsAndHealth starts the default http server on MetricsPort and
// blocks for as long as it is serving. A nil handler is passed, so requests
// are routed by http.DefaultServeMux.
//
// If the server cannot start or stops, the error is logged instead of being
// discarded; the process itself is left running.
func ServeMetricsAndHealth() {
	host := fmt.Sprintf(":%d", MetricsPort)

	// ListenAndServe only returns on failure (for example when the port is
	// already in use), so surface the reason rather than failing silently.
	if err := http.ListenAndServe(host, nil); err != nil {
		slog.Errorf("Metrics and healthcheck server on %s stopped: %v", host, err)
	}
}
2019-09-21 22:03:26 +00:00
func main ( ) {
2021-05-11 03:12:18 +00:00
showVersion := flag . Bool ( "version" , false , "Display the version of minitor and exit" )
configPath := flag . String ( "config" , "config.yml" , "Alternate configuration path (default: config.yml)" )
flag . BoolVar ( & slog . DebugLevel , "debug" , false , "Enables debug logs (default: false)" )
2019-11-15 19:25:21 +00:00
flag . BoolVar ( & ExportMetrics , "metrics" , false , "Enables prometheus metrics exporting (default: false)" )
2020-02-20 01:31:04 +00:00
flag . BoolVar ( & PyCompat , "py-compat" , false , "Enables support for legacy Python Minitor config. Will eventually be removed. (default: false)" )
2024-04-03 18:23:26 +00:00
flag . IntVar ( & MetricsPort , "metrics-port" , MetricsPort , "The port that Prometheus metrics and healthchecks should be exported on, if enabled. (default: 8080)" )
flag . BoolVar ( & SelfMonitor , "self-monitor" , false , "Enables self-monitoring. Export metrics rather than panic when alerts fail. (default: false)" )
2019-10-04 23:05:25 +00:00
flag . Parse ( )
2019-10-04 23:17:20 +00:00
// Print version if flag is provided
if * showVersion {
2021-05-11 03:12:18 +00:00
fmt . Println ( "Minitor version:" , version )
2019-10-04 23:17:20 +00:00
return
}
// Load configuration
2020-02-18 00:47:30 +00:00
config , err := LoadConfig ( * configPath )
2021-05-11 03:12:18 +00:00
slog . OnErrFatalf ( err , "Error loading config: %v" , err )
2019-09-21 22:03:26 +00:00
2019-11-15 19:25:21 +00:00
// Serve metrics exporter, if specified
if ExportMetrics {
2021-05-11 17:41:22 +00:00
slog . Infof ( "Exporting metrics to Prometheus on port %d" , MetricsPort )
2024-04-03 18:23:26 +00:00
HandleMetrics ( )
}
if SelfMonitor {
slog . Infof ( "Starting healthcheck endpoint on port %d" , MetricsPort )
HealthChecks = NewHealthCheckHandler ( config . Monitors )
HandleHealthCheck ( )
}
2021-05-11 03:12:18 +00:00
2024-04-03 18:23:26 +00:00
if ExportMetrics || SelfMonitor {
go ServeMetricsAndHealth ( )
2019-11-15 19:25:21 +00:00
}
2019-10-04 21:47:38 +00:00
// Start main loop
2019-09-21 22:03:26 +00:00
for {
2019-10-04 23:17:36 +00:00
err = checkMonitors ( & config )
2021-09-02 17:19:03 +00:00
slog . OnErrPanicf ( err , "Error checking monitors" )
2019-09-21 22:03:26 +00:00
2021-05-12 17:33:42 +00:00
time . Sleep ( config . CheckInterval . Value ( ) )
2019-09-21 22:03:26 +00:00
}
}