From befea7375fc7aceb1ee7ce957e3e901d30c8ec75 Mon Sep 17 00:00:00 2001 From: Ian Fijolek Date: Tue, 11 May 2021 10:41:22 -0700 Subject: [PATCH] Add check runtime metric --- main.go | 4 ++-- metrics.go | 15 ++++++++++++++- monitor.go | 18 +++++++++++++----- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/main.go b/main.go index e39a7b9..3df6d24 100644 --- a/main.go +++ b/main.go @@ -74,7 +74,7 @@ func checkMonitors(config *Config) error { // Track status metrics Metrics.SetMonitorStatus(monitor.Name, monitor.IsUp()) - Metrics.CountCheck(monitor.Name, success, hasAlert) + Metrics.CountCheck(monitor.Name, success, monitor.LastCheckMilliseconds(), hasAlert) if alertNotice != nil { return sendAlerts(config, monitor, alertNotice) @@ -108,7 +108,7 @@ func main() { // Serve metrics exporter, if specified if ExportMetrics { - slog.Infof("Exporting metrics to Prometheus") + slog.Infof("Exporting metrics to Prometheus on port %d", MetricsPort) go ServeMetrics() } diff --git a/metrics.go b/metrics.go index e67f97c..2f7716b 100644 --- a/metrics.go +++ b/metrics.go @@ -19,6 +19,7 @@ import ( type MinitorMetrics struct { alertCount *prometheus.CounterVec checkCount *prometheus.CounterVec + checkTime *prometheus.GaugeVec monitorStatus *prometheus.GaugeVec } @@ -40,6 +41,13 @@ func NewMetrics() *MinitorMetrics { }, []string{"monitor", "status", "is_alert"}, ), + checkTime: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "minitor_check_milliseconds", + Help: "Time in milliseconds that a check ran for", + }, + []string{"monitor", "status"}, + ), monitorStatus: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "minitor_monitor_up_count", @@ -52,6 +60,7 @@ func NewMetrics() *MinitorMetrics { // Register newly created metrics prometheus.MustRegister(metrics.alertCount) prometheus.MustRegister(metrics.checkCount) + prometheus.MustRegister(metrics.checkTime) prometheus.MustRegister(metrics.monitorStatus) return metrics @@ -68,7 +77,7 @@ func (metrics 
*MinitorMetrics) SetMonitorStatus(monitor string, isUp bool) { } // CountCheck counts the result of a particular Monitor check -func (metrics *MinitorMetrics) CountCheck(monitor string, isSuccess bool, isAlert bool) { +func (metrics *MinitorMetrics) CountCheck(monitor string, isSuccess bool, ms int64, isAlert bool) { status := "failure" if isSuccess { status = "success" @@ -82,6 +91,10 @@ func (metrics *MinitorMetrics) CountCheck(monitor string, isSuccess bool, isAler metrics.checkCount.With( prometheus.Labels{"monitor": monitor, "status": status, "is_alert": alertVal}, ).Inc() + + metrics.checkTime.With( + prometheus.Labels{"monitor": monitor, "status": status}, + ).Set(float64(ms)) } // CountAlert counts an alert diff --git a/monitor.go b/monitor.go index d26058e..059d707 100644 --- a/monitor.go +++ b/monitor.go @@ -20,11 +20,12 @@ type Monitor struct { //nolint:maligned Command CommandOrShell // Other values - alertCount int16 - failureCount int16 - lastCheck time.Time - lastSuccess time.Time - lastOutput string + alertCount int16 + failureCount int16 + lastCheck time.Time + lastSuccess time.Time + lastOutput string + lastCheckDuration time.Duration } // IsValid returns a boolean indicating if the Monitor has been correctly @@ -57,9 +58,11 @@ func (monitor *Monitor) Check() (bool, *AlertNotice) { cmd = ShellCommand(monitor.Command.ShellCommand) } + checkStartTime := time.Now() output, err := cmd.CombinedOutput() monitor.lastCheck = time.Now() monitor.lastOutput = string(output) + monitor.lastCheckDuration = monitor.lastCheck.Sub(checkStartTime) var alertNotice *AlertNotice @@ -88,6 +91,11 @@ func (monitor Monitor) IsUp() bool { return monitor.alertCount == 0 } +// LastCheckMilliseconds gives the number of milliseconds the last check ran for +func (monitor Monitor) LastCheckMilliseconds() int64 { + return monitor.lastCheckDuration.Milliseconds() +} + func (monitor *Monitor) success() (notice *AlertNotice) { if !monitor.IsUp() { // Alert that we have recovered