Compare commits
1 Commits
Author | SHA1 | Date | |
---|---|---|---|
0a36da79d6 |
72
health.go
Normal file
72
health.go
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"net/http"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
type HealthCheckHandler struct {
|
||||||
|
isMinitorHealthy bool
|
||||||
|
monitors []*Monitor
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewHealthCheckHandler(monitors []*Monitor) *HealthCheckHandler {
|
||||||
|
return &HealthCheckHandler{
|
||||||
|
false,
|
||||||
|
monitors,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (hch *HealthCheckHandler) MinitorHealthy(healthy bool) {
|
||||||
|
hch.isMinitorHealthy = healthy
|
||||||
|
}
|
||||||
|
|
||||||
|
func (hch HealthCheckHandler) MinitorHealthCheck() (bool, string) {
|
||||||
|
if hch.isMinitorHealthy {
|
||||||
|
return true, "OK"
|
||||||
|
} else {
|
||||||
|
return false, "UNHEALTHY"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (hch HealthCheckHandler) MonitorsHealthCheck() (bool, string) {
|
||||||
|
downMonitors := []string{}
|
||||||
|
|
||||||
|
for _, monitor := range hch.monitors {
|
||||||
|
if !monitor.IsUp() {
|
||||||
|
downMonitors = append(downMonitors, monitor.Name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(downMonitors) == 0 {
|
||||||
|
return true, "OK"
|
||||||
|
} else {
|
||||||
|
return false, fmt.Sprintf("UNHEALTHY: The following monitors are unhealthy: %s", strings.Join(downMonitors, ", "))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (hch HealthCheckHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
||||||
|
var healthy bool
|
||||||
|
|
||||||
|
var body string
|
||||||
|
|
||||||
|
if monitors := r.URL.Query().Get("monitors"); monitors != "" {
|
||||||
|
healthy, body = hch.MonitorsHealthCheck()
|
||||||
|
} else {
|
||||||
|
healthy, body = hch.MinitorHealthCheck()
|
||||||
|
}
|
||||||
|
|
||||||
|
if healthy {
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
} else {
|
||||||
|
w.WriteHeader(http.StatusServiceUnavailable)
|
||||||
|
}
|
||||||
|
|
||||||
|
_, _ = io.WriteString(w, body)
|
||||||
|
}
|
||||||
|
|
||||||
|
func HandleHealthCheck() {
|
||||||
|
http.Handle("/metrics", HealthChecks)
|
||||||
|
}
|
79
health_test.go
Normal file
79
health_test.go
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestNewHealthCheck(t *testing.T) {
|
||||||
|
monitors := []*Monitor{
|
||||||
|
{Name: "Test Monitor"},
|
||||||
|
}
|
||||||
|
hc := NewHealthCheckHandler(monitors)
|
||||||
|
|
||||||
|
monitors[0].alertCount++
|
||||||
|
|
||||||
|
if healthy, _ := hc.MinitorHealthCheck(); healthy {
|
||||||
|
t.Errorf("Initial hc state should be unhealthy until some successful alert is sent")
|
||||||
|
}
|
||||||
|
|
||||||
|
if healthy, _ := hc.MonitorsHealthCheck(); healthy {
|
||||||
|
t.Errorf("Faking an alert on the monitor pointer should make this unhealthy")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMinitorHealthCheck(t *testing.T) {
|
||||||
|
monitors := []*Monitor{
|
||||||
|
{Name: "Test Monitor"},
|
||||||
|
}
|
||||||
|
hc := NewHealthCheckHandler(monitors)
|
||||||
|
|
||||||
|
t.Run("MinitorHealthCheck(healthy)", func(t *testing.T) {
|
||||||
|
hc.MinitorHealthy(true)
|
||||||
|
healthy, body := hc.MinitorHealthCheck()
|
||||||
|
if !healthy {
|
||||||
|
t.Errorf("Expected healthy check")
|
||||||
|
}
|
||||||
|
if body != "OK" {
|
||||||
|
t.Errorf("Expected OK response")
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("MinitorHealthCheck(unhealthy)", func(t *testing.T) {
|
||||||
|
hc.MinitorHealthy(false)
|
||||||
|
healthy, body := hc.MinitorHealthCheck()
|
||||||
|
if healthy {
|
||||||
|
t.Errorf("Expected healthy check")
|
||||||
|
}
|
||||||
|
if body != "UNHEALTHY" {
|
||||||
|
t.Errorf("Expected UNHEALTHY response")
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMonitorsHealthCheck(t *testing.T) {
|
||||||
|
monitors := []*Monitor{
|
||||||
|
{Name: "Test Monitor"},
|
||||||
|
}
|
||||||
|
hc := NewHealthCheckHandler(monitors)
|
||||||
|
|
||||||
|
t.Run("MonitorsHealthCheck(healthy)", func(t *testing.T) {
|
||||||
|
healthy, body := hc.MonitorsHealthCheck()
|
||||||
|
if !healthy {
|
||||||
|
t.Errorf("Expected healthy check")
|
||||||
|
}
|
||||||
|
if body != "OK" {
|
||||||
|
t.Errorf("Expected OK response")
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("MonitorsHealthCheck(unhealthy)", func(t *testing.T) {
|
||||||
|
monitors[0].alertCount++
|
||||||
|
healthy, body := hc.MonitorsHealthCheck()
|
||||||
|
if healthy {
|
||||||
|
t.Errorf("Expected healthy check")
|
||||||
|
}
|
||||||
|
if body != "UNHEALTHY: The following monitors are unhealthy: Test Monitor" {
|
||||||
|
t.Errorf("Expected UNHEALTHY response")
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
49
main.go
49
main.go
@ -4,6 +4,7 @@ import (
|
|||||||
"errors"
|
"errors"
|
||||||
"flag"
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"net/http"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"git.iamthefij.com/iamthefij/slog"
|
"git.iamthefij.com/iamthefij/slog"
|
||||||
@ -16,6 +17,10 @@ var (
|
|||||||
MetricsPort = 8080
|
MetricsPort = 8080
|
||||||
// Metrics contains all active metrics
|
// Metrics contains all active metrics
|
||||||
Metrics = NewMetrics()
|
Metrics = NewMetrics()
|
||||||
|
// Self monitor rather than panicking
|
||||||
|
SelfMonitor = false
|
||||||
|
// HealthChecks contains health check values
|
||||||
|
HealthChecks *HealthCheckHandler = nil
|
||||||
|
|
||||||
// PyCompat enables support for legacy Python templates
|
// PyCompat enables support for legacy Python templates
|
||||||
PyCompat = false
|
PyCompat = false
|
||||||
@ -51,7 +56,13 @@ func sendAlerts(config *Config, monitor *Monitor, alertNotice *AlertNotice) erro
|
|||||||
output,
|
output,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if SelfMonitor {
|
||||||
|
Metrics.SetMonitorStatus(fmt.Sprintf("Alert %s", alertName), false)
|
||||||
|
}
|
||||||
|
|
||||||
return err
|
return err
|
||||||
|
} else {
|
||||||
|
Metrics.SetMonitorStatus(fmt.Sprintf("Alert %s", alertName), true)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Count alert metrics
|
// Count alert metrics
|
||||||
@ -69,6 +80,8 @@ func sendAlerts(config *Config, monitor *Monitor, alertNotice *AlertNotice) erro
|
|||||||
|
|
||||||
func checkMonitors(config *Config) error {
|
func checkMonitors(config *Config) error {
|
||||||
// TODO: Run this in goroutines and capture exceptions
|
// TODO: Run this in goroutines and capture exceptions
|
||||||
|
healthy := true
|
||||||
|
|
||||||
for _, monitor := range config.Monitors {
|
for _, monitor := range config.Monitors {
|
||||||
if monitor.ShouldCheck() {
|
if monitor.ShouldCheck() {
|
||||||
success, alertNotice := monitor.Check()
|
success, alertNotice := monitor.Check()
|
||||||
@ -80,17 +93,32 @@ func checkMonitors(config *Config) error {
|
|||||||
|
|
||||||
if alertNotice != nil {
|
if alertNotice != nil {
|
||||||
err := sendAlerts(config, monitor, alertNotice)
|
err := sendAlerts(config, monitor, alertNotice)
|
||||||
// If there was an error in sending an alert, exit early and bubble it up
|
// If there was an error in sending an alert, mark as unhealthy or bubble up
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
if SelfMonitor {
|
||||||
|
healthy = false
|
||||||
|
} else {
|
||||||
|
return err
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if HealthChecks != nil {
|
||||||
|
HealthChecks.MinitorHealthy(healthy)
|
||||||
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ServeMetricsAndHealth starts the default http server
|
||||||
|
func ServeMetricsAndHealth() {
|
||||||
|
host := fmt.Sprintf(":%d", MetricsPort)
|
||||||
|
|
||||||
|
_ = http.ListenAndServe(host, nil)
|
||||||
|
}
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
showVersion := flag.Bool("version", false, "Display the version of minitor and exit")
|
showVersion := flag.Bool("version", false, "Display the version of minitor and exit")
|
||||||
configPath := flag.String("config", "config.yml", "Alternate configuration path (default: config.yml)")
|
configPath := flag.String("config", "config.yml", "Alternate configuration path (default: config.yml)")
|
||||||
@ -98,7 +126,9 @@ func main() {
|
|||||||
flag.BoolVar(&slog.DebugLevel, "debug", false, "Enables debug logs (default: false)")
|
flag.BoolVar(&slog.DebugLevel, "debug", false, "Enables debug logs (default: false)")
|
||||||
flag.BoolVar(&ExportMetrics, "metrics", false, "Enables prometheus metrics exporting (default: false)")
|
flag.BoolVar(&ExportMetrics, "metrics", false, "Enables prometheus metrics exporting (default: false)")
|
||||||
flag.BoolVar(&PyCompat, "py-compat", false, "Enables support for legacy Python Minitor config. Will eventually be removed. (default: false)")
|
flag.BoolVar(&PyCompat, "py-compat", false, "Enables support for legacy Python Minitor config. Will eventually be removed. (default: false)")
|
||||||
flag.IntVar(&MetricsPort, "metrics-port", MetricsPort, "The port that Prometheus metrics should be exported on, if enabled. (default: 8080)")
|
flag.IntVar(&MetricsPort, "metrics-port", MetricsPort, "The port that Prometheus metrics and healthchecks should be exported on, if enabled. (default: 8080)")
|
||||||
|
flag.BoolVar(&SelfMonitor, "self-monitor", false, "Enables self-monitoring. Export metrics rather than panic when alerts fail. (default: false)")
|
||||||
|
|
||||||
flag.Parse()
|
flag.Parse()
|
||||||
|
|
||||||
// Print version if flag is provided
|
// Print version if flag is provided
|
||||||
@ -115,8 +145,19 @@ func main() {
|
|||||||
// Serve metrics exporter, if specified
|
// Serve metrics exporter, if specified
|
||||||
if ExportMetrics {
|
if ExportMetrics {
|
||||||
slog.Infof("Exporting metrics to Prometheus on port %d", MetricsPort)
|
slog.Infof("Exporting metrics to Prometheus on port %d", MetricsPort)
|
||||||
|
HandleMetrics()
|
||||||
|
}
|
||||||
|
|
||||||
go ServeMetrics()
|
if SelfMonitor {
|
||||||
|
slog.Infof("Starting healthcheck endpoint on port %d", MetricsPort)
|
||||||
|
|
||||||
|
HealthChecks = NewHealthCheckHandler(config.Monitors)
|
||||||
|
|
||||||
|
HandleHealthCheck()
|
||||||
|
}
|
||||||
|
|
||||||
|
if ExportMetrics || SelfMonitor {
|
||||||
|
go ServeMetricsAndHealth()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Start main loop
|
// Start main loop
|
||||||
|
80
main_test.go
80
main_test.go
@ -4,9 +4,10 @@ import "testing"
|
|||||||
|
|
||||||
func TestCheckMonitors(t *testing.T) {
|
func TestCheckMonitors(t *testing.T) {
|
||||||
cases := []struct {
|
cases := []struct {
|
||||||
config Config
|
config Config
|
||||||
expectErr bool
|
expectErr bool
|
||||||
name string
|
name string
|
||||||
|
selfMonitor bool
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
config: Config{},
|
config: Config{},
|
||||||
@ -22,8 +23,9 @@ func TestCheckMonitors(t *testing.T) {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
expectErr: false,
|
expectErr: false,
|
||||||
name: "Monitor success, no alerts",
|
name: "Monitor success, no alerts",
|
||||||
|
selfMonitor: false,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
config: Config{
|
config: Config{
|
||||||
@ -35,8 +37,9 @@ func TestCheckMonitors(t *testing.T) {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
expectErr: false,
|
expectErr: false,
|
||||||
name: "Monitor failure, no alerts",
|
name: "Monitor failure, no alerts",
|
||||||
|
selfMonitor: false,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
config: Config{
|
config: Config{
|
||||||
@ -48,8 +51,9 @@ func TestCheckMonitors(t *testing.T) {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
expectErr: false,
|
expectErr: false,
|
||||||
name: "Monitor recovery, no alerts",
|
name: "Monitor recovery, no alerts",
|
||||||
|
selfMonitor: false,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
config: Config{
|
config: Config{
|
||||||
@ -62,8 +66,9 @@ func TestCheckMonitors(t *testing.T) {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
expectErr: true,
|
expectErr: true,
|
||||||
name: "Monitor failure, unknown alerts",
|
name: "Monitor failure, unknown alerts",
|
||||||
|
selfMonitor: false,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
config: Config{
|
config: Config{
|
||||||
@ -76,8 +81,24 @@ func TestCheckMonitors(t *testing.T) {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
expectErr: true,
|
expectErr: true,
|
||||||
name: "Monitor recovery, unknown alerts",
|
name: "Monitor recovery, unknown alerts",
|
||||||
|
selfMonitor: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
config: Config{
|
||||||
|
Monitors: []*Monitor{
|
||||||
|
{
|
||||||
|
Name: "Success",
|
||||||
|
Command: CommandOrShell{Command: []string{"true"}},
|
||||||
|
AlertUp: []string{"unknown"},
|
||||||
|
alertCount: 1,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
expectErr: false,
|
||||||
|
name: "Monitor recovery, unknown alerts, with Health Check",
|
||||||
|
selfMonitor: true,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
config: Config{
|
config: Config{
|
||||||
@ -95,8 +116,9 @@ func TestCheckMonitors(t *testing.T) {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
expectErr: false,
|
expectErr: false,
|
||||||
name: "Monitor failure, successful alert",
|
name: "Monitor failure, successful alert",
|
||||||
|
selfMonitor: false,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
config: Config{
|
config: Config{
|
||||||
@ -115,12 +137,36 @@ func TestCheckMonitors(t *testing.T) {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
expectErr: true,
|
expectErr: true,
|
||||||
name: "Monitor failure, bad alert",
|
name: "Monitor failure, bad alert",
|
||||||
|
selfMonitor: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
config: Config{
|
||||||
|
Monitors: []*Monitor{
|
||||||
|
{
|
||||||
|
Name: "Failure",
|
||||||
|
Command: CommandOrShell{Command: []string{"false"}},
|
||||||
|
AlertDown: []string{"bad"},
|
||||||
|
AlertAfter: 1,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
Alerts: map[string]*Alert{
|
||||||
|
"bad": {
|
||||||
|
Name: "bad",
|
||||||
|
Command: CommandOrShell{Command: []string{"false"}},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
expectErr: false,
|
||||||
|
name: "Monitor failure, bad alert, with Health Check",
|
||||||
|
selfMonitor: true,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, c := range cases {
|
for _, c := range cases {
|
||||||
|
SelfMonitor = c.selfMonitor
|
||||||
|
|
||||||
err := c.config.Init()
|
err := c.config.Init()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Errorf("checkMonitors(%s): unexpected error reading config: %v", c.name, err)
|
t.Errorf("checkMonitors(%s): unexpected error reading config: %v", c.name, err)
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
|
||||||
"net/http"
|
"net/http"
|
||||||
|
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
@ -107,11 +106,7 @@ func (metrics *MinitorMetrics) CountAlert(monitor string, alert string) {
|
|||||||
).Inc()
|
).Inc()
|
||||||
}
|
}
|
||||||
|
|
||||||
// ServeMetrics starts an http server with a Prometheus metrics handler
|
// HandleMetrics adds the Prometheus metrics handler to the default http server
|
||||||
func ServeMetrics() {
|
func HandleMetrics() {
|
||||||
http.Handle("/metrics", promhttp.Handler())
|
http.Handle("/metrics", promhttp.Handler())
|
||||||
|
|
||||||
host := fmt.Sprintf(":%d", MetricsPort)
|
|
||||||
|
|
||||||
_ = http.ListenAndServe(host, nil)
|
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user