From 0a36da79d68bcfa5dc62e9fcea04b69ea40f189e Mon Sep 17 00:00:00 2001
From: Ian Fijolek
Date: Wed, 3 Apr 2024 11:23:26 -0700
Subject: [PATCH] Add health check and self reporting of health

This avoids panicking and instead provides an HTTP endpoint to report
health
---
 health.go      | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++
 health_test.go | 79 +++++++++++++++++++++++++++++++++++++++++++
 main.go        | 49 ++++++++++++++++++++++++---
 main_test.go   | 80 ++++++++++++++++++++++++++++++++++---------
 metrics.go     |  9 ++---
 5 files changed, 281 insertions(+), 28 deletions(-)
 create mode 100644 health.go
 create mode 100644 health_test.go

diff --git a/health.go b/health.go
new file mode 100644
index 0000000..40b23d0
--- /dev/null
+++ b/health.go
@@ -0,0 +1,92 @@
+package main
+
+import (
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+)
+
+// HealthCheckHandler is an http.Handler that reports the health of Minitor
+// itself and, on request, of the monitors it was constructed with.
+type HealthCheckHandler struct {
+	isMinitorHealthy bool
+	monitors         []*Monitor
+}
+
+// NewHealthCheckHandler returns a handler for the given monitors. Minitor is
+// reported as unhealthy until MinitorHealthy(true) is called.
+func NewHealthCheckHandler(monitors []*Monitor) *HealthCheckHandler {
+	return &HealthCheckHandler{
+		isMinitorHealthy: false,
+		monitors:         monitors,
+	}
+}
+
+// MinitorHealthy records whether Minitor itself is currently healthy.
+//
+// NOTE(review): checkMonitors calls this while the HTTP server, started in
+// its own goroutine, reads the flag through ServeHTTP. Nothing synchronizes
+// the two; confirm whether this needs a mutex or an atomic.
+func (hch *HealthCheckHandler) MinitorHealthy(healthy bool) {
+	hch.isMinitorHealthy = healthy
+}
+
+// MinitorHealthCheck returns the last recorded health of Minitor itself along
+// with the response body to report: "OK" or "UNHEALTHY".
+func (hch HealthCheckHandler) MinitorHealthCheck() (bool, string) {
+	if hch.isMinitorHealthy {
+		return true, "OK"
+	}
+
+	return false, "UNHEALTHY"
+}
+
+// MonitorsHealthCheck returns true and "OK" when every monitor reports IsUp.
+// Otherwise it returns false and a body naming each monitor that is down.
+func (hch HealthCheckHandler) MonitorsHealthCheck() (bool, string) {
+	downMonitors := []string{}
+
+	for _, monitor := range hch.monitors {
+		if !monitor.IsUp() {
+			downMonitors = append(downMonitors, monitor.Name)
+		}
+	}
+
+	if len(downMonitors) == 0 {
+		return true, "OK"
+	}
+
+	return false, fmt.Sprintf("UNHEALTHY: The following monitors are unhealthy: %s", strings.Join(downMonitors, ", "))
+}
+
+// ServeHTTP responds with 200 when healthy and 503 otherwise, with the check
+// result as the body. Monitor health is reported when the request has a
+// non-empty "monitors" query parameter; otherwise Minitor's own health is.
+func (hch HealthCheckHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
+	var healthy bool
+
+	var body string
+
+	if monitors := r.URL.Query().Get("monitors"); monitors != "" {
+		healthy, body = hch.MonitorsHealthCheck()
+	} else {
+		healthy, body = hch.MinitorHealthCheck()
+	}
+
+	if healthy {
+		w.WriteHeader(http.StatusOK)
+	} else {
+		w.WriteHeader(http.StatusServiceUnavailable)
+	}
+
+	// The write error is explicitly discarded; the status is already sent.
+	_, _ = io.WriteString(w, body)
+}
+
+// HandleHealthCheck registers HealthChecks on the default http server. It uses
+// its own path because HandleMetrics already registers "/metrics" and
+// registering the same pattern twice panics.
+func HandleHealthCheck() {
+	http.Handle("/health", HealthChecks)
+}
diff --git a/health_test.go b/health_test.go
new file mode 100644
index 0000000..a83d18a
--- /dev/null
+++ b/health_test.go
@@ -0,0 +1,79 @@
+package main
+
+import (
+	"testing"
+)
+
+func TestNewHealthCheck(t *testing.T) {
+	monitors := []*Monitor{
+		{Name: "Test Monitor"},
+	}
+	hc := NewHealthCheckHandler(monitors)
+
+	monitors[0].alertCount++
+
+	if healthy, _ := hc.MinitorHealthCheck(); healthy {
+		t.Errorf("Initial hc state should be unhealthy until some successful alert is sent")
+	}
+
+	if healthy, _ := hc.MonitorsHealthCheck(); healthy {
+		t.Errorf("Faking an alert on the monitor pointer should make this unhealthy")
+	}
+}
+
+func TestMinitorHealthCheck(t *testing.T) {
+	monitors := []*Monitor{
+		{Name: "Test Monitor"},
+	}
+	hc := NewHealthCheckHandler(monitors)
+
+	t.Run("MinitorHealthCheck(healthy)", func(t *testing.T) {
+		hc.MinitorHealthy(true)
+		healthy, body := hc.MinitorHealthCheck()
+		if !healthy {
+			t.Errorf("Expected healthy check")
+		}
+		if body != "OK" {
+			t.Errorf("Expected OK response")
+		}
+	})
+
+	t.Run("MinitorHealthCheck(unhealthy)", func(t *testing.T) {
+		hc.MinitorHealthy(false)
+		healthy, body := hc.MinitorHealthCheck()
+		if healthy {
+			t.Errorf("Expected unhealthy check")
+		}
+		if body != "UNHEALTHY" {
+			t.Errorf("Expected UNHEALTHY response")
+		}
+	})
+}
+
+func TestMonitorsHealthCheck(t *testing.T) {
+	monitors := []*Monitor{
+		{Name: "Test Monitor"},
+	}
+	hc := NewHealthCheckHandler(monitors)
+
+	t.Run("MonitorsHealthCheck(healthy)", func(t *testing.T) {
+		healthy, body := hc.MonitorsHealthCheck()
+		if !healthy {
+			t.Errorf("Expected healthy check")
+		}
+		if body != "OK" {
+			t.Errorf("Expected OK response")
+		}
+	})
+
+	t.Run("MonitorsHealthCheck(unhealthy)", func(t *testing.T) {
+		monitors[0].alertCount++
+		healthy, body := hc.MonitorsHealthCheck()
+		if healthy {
+			t.Errorf("Expected unhealthy check")
+		}
+		if body != "UNHEALTHY: The following monitors are unhealthy: Test Monitor" {
+			t.Errorf("Expected UNHEALTHY response")
+		}
+	})
+}
diff --git a/main.go b/main.go
index 97249e2..d1a430f 100644
--- a/main.go
+++ b/main.go
@@ -4,6 +4,7 @@ import (
 	"errors"
 	"flag"
 	"fmt"
+	"net/http"
 	"time"
 
 	"git.iamthefij.com/iamthefij/slog"
@@ -16,6 +17,10 @@ var (
 	MetricsPort = 8080
 	// Metrics contains all active metrics
 	Metrics = NewMetrics()
+	// SelfMonitor enables self-monitoring rather than panicking
+	SelfMonitor = false
+	// HealthChecks contains health check values
+	HealthChecks *HealthCheckHandler = nil
 
 	// PyCompat enables support for legacy Python templates
 	PyCompat = false
@@ -51,7 +56,13 @@ func sendAlerts(config *Config, monitor *Monitor, alertNotice *AlertNotice) erro
 					output,
 				)
 
+				if SelfMonitor {
+					Metrics.SetMonitorStatus(fmt.Sprintf("Alert %s", alertName), false)
+				}
+
 				return err
 			}
+
+			Metrics.SetMonitorStatus(fmt.Sprintf("Alert %s", alertName), true)
 
 			// Count alert metrics
@@ -69,6 +80,8 @@ func sendAlerts(config *Config, monitor *Monitor, alertNotice *AlertNotice) erro
 
 func checkMonitors(config *Config) error {
 	// TODO: Run this in goroutines and capture exceptions
+	healthy := true
+
 	for _, monitor := range config.Monitors {
 		if monitor.ShouldCheck() {
 			success, alertNotice := monitor.Check()
@@ -80,17 +93,32 @@ func checkMonitors(config *Config) error {
 
 			if alertNotice != nil {
 				err := sendAlerts(config, monitor, alertNotice)
-				// If there was an error in sending an alert, exit early and bubble it up
+				// If there was an error in sending an alert, mark as unhealthy or bubble up
 				if err != nil {
-					return err
+					if SelfMonitor {
+						healthy = false
+					} else {
+						return err
+					}
 				}
 			}
 		}
 	}
 
+	if HealthChecks != nil {
+		HealthChecks.MinitorHealthy(healthy)
+	}
+
 	return nil
 }
 
+// ServeMetricsAndHealth starts the default http server
+func ServeMetricsAndHealth() {
+	host := fmt.Sprintf(":%d", MetricsPort)
+
+	_ = http.ListenAndServe(host, nil)
+}
+
 func main() {
 	showVersion := flag.Bool("version", false, "Display the version of minitor and exit")
 	configPath := flag.String("config", "config.yml", "Alternate configuration path (default: config.yml)")
@@ -98,7 +126,9 @@ func main() {
 	flag.BoolVar(&slog.DebugLevel, "debug", false, "Enables debug logs (default: false)")
 	flag.BoolVar(&ExportMetrics, "metrics", false, "Enables prometheus metrics exporting (default: false)")
 	flag.BoolVar(&PyCompat, "py-compat", false, "Enables support for legacy Python Minitor config. Will eventually be removed. (default: false)")
-	flag.IntVar(&MetricsPort, "metrics-port", MetricsPort, "The port that Prometheus metrics should be exported on, if enabled. (default: 8080)")
+	flag.IntVar(&MetricsPort, "metrics-port", MetricsPort, "The port that Prometheus metrics and healthchecks should be exported on, if enabled. (default: 8080)")
+	flag.BoolVar(&SelfMonitor, "self-monitor", false, "Enables self-monitoring. Export metrics rather than panic when alerts fail. (default: false)")
+
 	flag.Parse()
 
 	// Print version if flag is provided
@@ -115,8 +145,19 @@ func main() {
 	// Serve metrics exporter, if specified
 	if ExportMetrics {
 		slog.Infof("Exporting metrics to Prometheus on port %d", MetricsPort)
+		HandleMetrics()
+	}
 
-		go ServeMetrics()
+	if SelfMonitor {
+		slog.Infof("Starting healthcheck endpoint on port %d", MetricsPort)
+
+		HealthChecks = NewHealthCheckHandler(config.Monitors)
+
+		HandleHealthCheck()
+	}
+
+	if ExportMetrics || SelfMonitor {
+		go ServeMetricsAndHealth()
 	}
 
 	// Start main loop
diff --git a/main_test.go b/main_test.go
index 9bc11d7..549492f 100644
--- a/main_test.go
+++ b/main_test.go
@@ -4,9 +4,10 @@ import "testing"
 
 func TestCheckMonitors(t *testing.T) {
 	cases := []struct {
-		config    Config
-		expectErr bool
-		name      string
+		config      Config
+		expectErr   bool
+		name        string
+		selfMonitor bool
 	}{
 		{
 			config:    Config{},
@@ -22,8 +23,9 @@ func TestCheckMonitors(t *testing.T) {
 					},
 				},
 			},
-			expectErr: false,
-			name:      "Monitor success, no alerts",
+			expectErr:   false,
+			name:        "Monitor success, no alerts",
+			selfMonitor: false,
 		},
 		{
 			config: Config{
@@ -35,8 +37,9 @@ func TestCheckMonitors(t *testing.T) {
 					},
 				},
 			},
-			expectErr: false,
-			name:      "Monitor failure, no alerts",
+			expectErr:   false,
+			name:        "Monitor failure, no alerts",
+			selfMonitor: false,
 		},
 		{
 			config: Config{
@@ -48,8 +51,9 @@ func TestCheckMonitors(t *testing.T) {
 					},
 				},
 			},
-			expectErr: false,
-			name:      "Monitor recovery, no alerts",
+			expectErr:   false,
+			name:        "Monitor recovery, no alerts",
+			selfMonitor: false,
 		},
 		{
 			config: Config{
@@ -62,8 +66,9 @@ func TestCheckMonitors(t *testing.T) {
 					},
 				},
 			},
-			expectErr: true,
-			name:      "Monitor failure, unknown alerts",
+			expectErr:   true,
+			name:        "Monitor failure, unknown alerts",
+			selfMonitor: false,
 		},
 		{
 			config: Config{
@@ -76,8 +81,24 @@ func TestCheckMonitors(t *testing.T) {
 					},
 				},
 			},
-			expectErr: true,
-			name:      "Monitor recovery, unknown alerts",
+			expectErr:   true,
+			name:        "Monitor recovery, unknown alerts",
+			selfMonitor: false,
+		},
+		{
+			config: Config{
+				Monitors: []*Monitor{
+					{
+						Name:       "Success",
+						Command:    CommandOrShell{Command: []string{"true"}},
+						AlertUp:    []string{"unknown"},
+						alertCount: 1,
+					},
+				},
+			},
+			expectErr:   false,
+			name:        "Monitor recovery, unknown alerts, with Health Check",
+			selfMonitor: true,
 		},
 		{
 			config: Config{
@@ -95,8 +116,9 @@ func TestCheckMonitors(t *testing.T) {
 					},
 				},
 			},
-			expectErr: false,
-			name:      "Monitor failure, successful alert",
+			expectErr:   false,
+			name:        "Monitor failure, successful alert",
+			selfMonitor: false,
 		},
 		{
 			config: Config{
@@ -115,12 +137,36 @@ func TestCheckMonitors(t *testing.T) {
 					},
 				},
 			},
-			expectErr: true,
-			name:      "Monitor failure, bad alert",
+			expectErr:   true,
+			name:        "Monitor failure, bad alert",
+			selfMonitor: false,
+		},
+		{
+			config: Config{
+				Monitors: []*Monitor{
+					{
+						Name:       "Failure",
+						Command:    CommandOrShell{Command: []string{"false"}},
+						AlertDown:  []string{"bad"},
+						AlertAfter: 1,
+					},
+				},
+				Alerts: map[string]*Alert{
+					"bad": {
+						Name:    "bad",
+						Command: CommandOrShell{Command: []string{"false"}},
+					},
+				},
+			},
+			expectErr:   false,
+			name:        "Monitor failure, bad alert, with Health Check",
+			selfMonitor: true,
 		},
 	}
 
 	for _, c := range cases {
+		SelfMonitor = c.selfMonitor
+
 		err := c.config.Init()
 		if err != nil {
 			t.Errorf("checkMonitors(%s): unexpected error reading config: %v", c.name, err)
diff --git a/metrics.go b/metrics.go
index 2f7716b..7065a52 100644
--- a/metrics.go
+++ b/metrics.go
@@ -1,7 +1,6 @@
 package main
 
 import (
-	"fmt"
 	"net/http"
 
 	"github.com/prometheus/client_golang/prometheus"
@@ -107,11 +106,7 @@ func (metrics *MinitorMetrics) CountAlert(monitor string, alert string) {
 	).Inc()
 }
 
-// ServeMetrics starts an http server with a Prometheus metrics handler
-func ServeMetrics() {
+// HandleMetrics adds the Prometheus metrics handler to the default http server
+func HandleMetrics() {
 	http.Handle("/metrics", promhttp.Handler())
-
-	host := fmt.Sprintf(":%d", MetricsPort)
-
-	_ = http.ListenAndServe(host, nil)
 }