Add check runtime metric

duration-intervals
IamTheFij 2 years ago
parent 30c2c7d6b2
commit befea7375f
  1. 4
      main.go
  2. 15
      metrics.go
  3. 18
      monitor.go

@ -74,7 +74,7 @@ func checkMonitors(config *Config) error {
// Track status metrics
Metrics.SetMonitorStatus(monitor.Name, monitor.IsUp())
Metrics.CountCheck(monitor.Name, success, hasAlert)
Metrics.CountCheck(monitor.Name, success, monitor.LastCheckMilliseconds(), hasAlert)
if alertNotice != nil {
return sendAlerts(config, monitor, alertNotice)
@ -108,7 +108,7 @@ func main() {
// Serve metrics exporter, if specified
if ExportMetrics {
slog.Infof("Exporting metrics to Prometheus")
slog.Infof("Exporting metrics to Prometheus on port %d", MetricsPort)
go ServeMetrics()
}

@ -19,6 +19,7 @@ import (
type MinitorMetrics struct {
	// Counter vectors for alerts sent and checks performed.
	alertCount, checkCount *prometheus.CounterVec
	// Gauge vectors. checkTime is set by CountCheck to the duration of a
	// check in milliseconds; monitorStatus is exported as
	// minitor_monitor_up_count.
	checkTime, monitorStatus *prometheus.GaugeVec
}
@ -40,6 +41,13 @@ func NewMetrics() *MinitorMetrics {
},
[]string{"monitor", "status", "is_alert"},
),
checkTime: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "minitor_check_milliseconds",
Help: "Time in miliseconds that a check ran for",
},
[]string{"monitor", "status"},
),
monitorStatus: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "minitor_monitor_up_count",
@ -52,6 +60,7 @@ func NewMetrics() *MinitorMetrics {
// Register newly created metrics
prometheus.MustRegister(metrics.alertCount)
prometheus.MustRegister(metrics.checkCount)
prometheus.MustRegister(metrics.checkTime)
prometheus.MustRegister(metrics.monitorStatus)
return metrics
@ -68,7 +77,7 @@ func (metrics *MinitorMetrics) SetMonitorStatus(monitor string, isUp bool) {
}
// CountCheck counts the result of a particular Monitor check and records how long that check ran, in milliseconds
func (metrics *MinitorMetrics) CountCheck(monitor string, isSuccess bool, isAlert bool) {
func (metrics *MinitorMetrics) CountCheck(monitor string, isSuccess bool, ms int64, isAlert bool) {
status := "failure"
if isSuccess {
status = "success"
@ -82,6 +91,10 @@ func (metrics *MinitorMetrics) CountCheck(monitor string, isSuccess bool, isAler
metrics.checkCount.With(
prometheus.Labels{"monitor": monitor, "status": status, "is_alert": alertVal},
).Inc()
metrics.checkTime.With(
prometheus.Labels{"monitor": monitor, "status": status},
).Set(float64(ms))
}
// CountAlert counts an alert

@ -20,11 +20,12 @@ type Monitor struct { //nolint:maligned
Command CommandOrShell
// Other values
alertCount int16
failureCount int16
lastCheck time.Time
lastSuccess time.Time
lastOutput string
alertCount int16
failureCount int16
lastCheck time.Time
lastSuccess time.Time
lastOutput string
lastCheckDuration time.Duration
}
// IsValid returns a boolean indicating if the Monitor has been correctly
@ -57,9 +58,11 @@ func (monitor *Monitor) Check() (bool, *AlertNotice) {
cmd = ShellCommand(monitor.Command.ShellCommand)
}
checkStartTime := time.Now()
output, err := cmd.CombinedOutput()
monitor.lastCheck = time.Now()
monitor.lastOutput = string(output)
monitor.lastCheckDuration = monitor.lastCheck.Sub(checkStartTime)
var alertNotice *AlertNotice
@ -88,6 +91,11 @@ func (monitor Monitor) IsUp() bool {
return monitor.alertCount == 0
}
// LastCheckMilliseconds reports how long the most recent check ran,
// expressed as a whole number of milliseconds.
func (monitor Monitor) LastCheckMilliseconds() int64 {
	elapsed := monitor.lastCheckDuration
	return elapsed.Milliseconds()
}
func (monitor *Monitor) success() (notice *AlertNotice) {
if !monitor.IsUp() {
// Alert that we have recovered

Loading…
Cancel
Save