Add alerting for recovered monitors
Based on an idea from SeaLife, this adds alerts that fire when a monitor recovers from an alerting (down) state. It also includes unit tests covering the new code.
This commit is contained in:
parent
4fe8020a77
commit
aad28976e2
@ -98,7 +98,9 @@ class InvalidMonitorException(Exception):
|
|||||||
|
|
||||||
|
|
||||||
class MinitorAlert(Exception):
|
class MinitorAlert(Exception):
|
||||||
pass
|
def __init__(self, message, monitor):
|
||||||
|
super().__init__(message)
|
||||||
|
self.monitor = monitor
|
||||||
|
|
||||||
|
|
||||||
class Monitor(object):
|
class Monitor(object):
|
||||||
@ -117,7 +119,10 @@ class Monitor(object):
|
|||||||
|
|
||||||
self.name = settings['name']
|
self.name = settings['name']
|
||||||
self.command = settings['command']
|
self.command = settings['command']
|
||||||
self.alerts = settings.get('alerts', [])
|
self.alert_down = settings.get('alert_down', [])
|
||||||
|
if not self.alert_down:
|
||||||
|
self.alert_down = settings.get('alerts', [])
|
||||||
|
self.alert_up = settings.get('alert_up', [])
|
||||||
self.check_interval = settings.get('check_interval')
|
self.check_interval = settings.get('check_interval')
|
||||||
self.alert_after = settings.get('alert_after')
|
self.alert_after = settings.get('alert_after')
|
||||||
self.alert_every = settings.get('alert_every')
|
self.alert_every = settings.get('alert_every')
|
||||||
@ -162,9 +167,17 @@ class Monitor(object):
|
|||||||
|
|
||||||
def success(self):
|
def success(self):
|
||||||
"""Handles success tasks"""
|
"""Handles success tasks"""
|
||||||
|
back_up = None
|
||||||
|
if not self.is_up():
|
||||||
|
back_up = MinitorAlert(
|
||||||
|
'{} check is up again!'.format(self.name),
|
||||||
|
self,
|
||||||
|
)
|
||||||
self.total_failure_count = 0
|
self.total_failure_count = 0
|
||||||
self.alert_count = 0
|
self.alert_count = 0
|
||||||
self.last_success = datetime.now()
|
self.last_success = datetime.now()
|
||||||
|
if back_up:
|
||||||
|
raise back_up
|
||||||
|
|
||||||
def failure(self):
|
def failure(self):
|
||||||
"""Handles failure tasks and possibly raises MinitorAlert"""
|
"""Handles failure tasks and possibly raises MinitorAlert"""
|
||||||
@ -182,9 +195,16 @@ class Monitor(object):
|
|||||||
|
|
||||||
if should_alert:
|
if should_alert:
|
||||||
self.alert_count += 1
|
self.alert_count += 1
|
||||||
raise MinitorAlert('{} check has failed {} times'.format(
|
raise MinitorAlert(
|
||||||
self.name, self.total_failure_count
|
'{} check has failed {} times'.format(
|
||||||
))
|
self.name, self.total_failure_count
|
||||||
|
),
|
||||||
|
self
|
||||||
|
)
|
||||||
|
|
||||||
|
def is_up(self):
|
||||||
|
"""Indicates if the monitor is already alerting failures"""
|
||||||
|
return self.alert_count == 0
|
||||||
|
|
||||||
|
|
||||||
class Alert(object):
|
class Alert(object):
|
||||||
@ -214,11 +234,12 @@ class Alert(object):
|
|||||||
return 'Never'
|
return 'Never'
|
||||||
return dt.isoformat()
|
return dt.isoformat()
|
||||||
|
|
||||||
def alert(self, monitor):
|
def alert(self, message, monitor):
|
||||||
"""Calls the alert command for the provided monitor"""
|
"""Calls the alert command for the provided monitor"""
|
||||||
output, ex = call_output(
|
output, ex = call_output(
|
||||||
self._formated_command(
|
self._formated_command(
|
||||||
alert_count=monitor.alert_count,
|
alert_count=monitor.alert_count,
|
||||||
|
alert_message=message,
|
||||||
monitor_name=monitor.name,
|
monitor_name=monitor.name,
|
||||||
failure_count=monitor.total_failure_count,
|
failure_count=monitor.total_failure_count,
|
||||||
last_success=self._format_datetime(monitor.last_success),
|
last_success=self._format_datetime(monitor.last_success),
|
||||||
@ -248,7 +269,7 @@ class Minitor(object):
|
|||||||
self.alerts = {
|
self.alerts = {
|
||||||
'log': Alert(
|
'log': Alert(
|
||||||
'log',
|
'log',
|
||||||
{'command': ['echo', '{monitor_name} has failed!']}
|
{'command': ['echo', '{alert_message}!']}
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
self.alerts.update({
|
self.alerts.update({
|
||||||
@ -275,10 +296,12 @@ class Minitor(object):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def alert_for_monitor(self, monitor):
|
def handle_minitor_alert(self, minitor_alert):
|
||||||
"""Issues all alerts for a provided monitor"""
|
"""Issues all alerts for a provided monitor"""
|
||||||
for alert in monitor.alerts:
|
monitor = minitor_alert.monitor
|
||||||
self.alerts[alert].alert(monitor)
|
alerts = monitor.alert_up if monitor.is_up() else monitor.alert_down
|
||||||
|
for alert in alerts:
|
||||||
|
self.alerts[alert].alert(str(minitor_alert), monitor)
|
||||||
|
|
||||||
def parse_args(self):
|
def parse_args(self):
|
||||||
"""Parses command line arguments and returns them"""
|
"""Parses command line arguments and returns them"""
|
||||||
@ -309,7 +332,7 @@ class Minitor(object):
|
|||||||
)
|
)
|
||||||
except MinitorAlert as minitor_alert:
|
except MinitorAlert as minitor_alert:
|
||||||
self.logger.warn(minitor_alert)
|
self.logger.warn(minitor_alert)
|
||||||
self.alert_for_monitor(monitor)
|
self.handle_minitor_alert(minitor_alert)
|
||||||
|
|
||||||
sleep(self.check_interval)
|
sleep(self.check_interval)
|
||||||
|
|
||||||
|
@ -32,20 +32,26 @@ class TestAlert(object):
|
|||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
'last_success',
|
'last_success,expected_success',
|
||||||
[
|
[
|
||||||
(None, 'Never'),
|
(None, 'Never'),
|
||||||
(datetime(2018, 4, 10), '2018-04-10T00:00:00')
|
(datetime(2018, 4, 10), '2018-04-10T00:00:00')
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
def test_simple_alert(self, monitor, echo_alert, last_success):
|
def test_simple_alert(
|
||||||
|
self,
|
||||||
|
monitor,
|
||||||
|
echo_alert,
|
||||||
|
last_success,
|
||||||
|
expected_success
|
||||||
|
):
|
||||||
monitor.total_failure_count = 1
|
monitor.total_failure_count = 1
|
||||||
monitor.alert_count = 1
|
monitor.alert_count = 1
|
||||||
monitor.last_success = last_success[0]
|
monitor.last_success = last_success
|
||||||
with patch.object(echo_alert.logger, 'error') as mock_error:
|
with patch.object(echo_alert.logger, 'error') as mock_error:
|
||||||
echo_alert.alert(monitor)
|
echo_alert.alert('Exception message', monitor)
|
||||||
mock_error.assert_called_once_with(
|
mock_error.assert_called_once_with(
|
||||||
'Dummy Monitor has failed 1 time(s)!\n'
|
'Dummy Monitor has failed 1 time(s)!\n'
|
||||||
'We have alerted 1 time(s)\n'
|
'We have alerted 1 time(s)\n'
|
||||||
'Last success was ' + last_success[1]
|
'Last success was ' + expected_success
|
||||||
)
|
)
|
||||||
|
@ -1,3 +1,6 @@
|
|||||||
|
from datetime import datetime
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from minitor.main import InvalidMonitorException
|
from minitor.main import InvalidMonitorException
|
||||||
@ -13,6 +16,11 @@ class TestMonitor(object):
|
|||||||
return Monitor({
|
return Monitor({
|
||||||
'name': 'Sample Monitor',
|
'name': 'Sample Monitor',
|
||||||
'command': ['echo', 'foo'],
|
'command': ['echo', 'foo'],
|
||||||
|
'alert_down': ['log'],
|
||||||
|
'alert_up': ['log'],
|
||||||
|
'check_interval': 1,
|
||||||
|
'alert_after': 1,
|
||||||
|
'alert_every': 1,
|
||||||
})
|
})
|
||||||
|
|
||||||
@pytest.mark.parametrize('settings', [
|
@pytest.mark.parametrize('settings', [
|
||||||
@ -92,3 +100,42 @@ class TestMonitor(object):
|
|||||||
monitor.failure()
|
monitor.failure()
|
||||||
else:
|
else:
|
||||||
monitor.failure()
|
monitor.failure()
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('last_check', [None, datetime(2018, 4, 10)])
|
||||||
|
def test_monitor_should_check(self, monitor, last_check):
|
||||||
|
monitor.last_check = last_check
|
||||||
|
assert monitor.should_check()
|
||||||
|
|
||||||
|
def test_monitor_check_fail(self, monitor):
|
||||||
|
with patch.object(monitor, 'failure') as mock_failure:
|
||||||
|
monitor.command = ['ls', '--not-real']
|
||||||
|
assert not monitor.check()
|
||||||
|
mock_failure.assert_called_once()
|
||||||
|
|
||||||
|
def test_monitor_check_success(self, monitor):
|
||||||
|
with patch.object(monitor, 'success') as mock_success:
|
||||||
|
assert monitor.check()
|
||||||
|
mock_success.assert_called_once()
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('failure_count', [0, 1])
|
||||||
|
def test_monitor_success(self, monitor, failure_count):
|
||||||
|
monitor.alert_count = 0
|
||||||
|
monitor.total_failure_count = failure_count
|
||||||
|
assert monitor.last_success is None
|
||||||
|
|
||||||
|
monitor.success()
|
||||||
|
|
||||||
|
assert monitor.alert_count == 0
|
||||||
|
assert monitor.last_success is not None
|
||||||
|
assert monitor.total_failure_count == 0
|
||||||
|
|
||||||
|
def test_monitor_success_back_up(self, monitor):
|
||||||
|
monitor.total_failure_count = 1
|
||||||
|
monitor.alert_count = 1
|
||||||
|
|
||||||
|
with pytest.raises(MinitorAlert):
|
||||||
|
monitor.success()
|
||||||
|
|
||||||
|
assert monitor.alert_count == 0
|
||||||
|
assert monitor.last_success is not None
|
||||||
|
assert monitor.total_failure_count == 0
|
||||||
|
Loading…
Reference in New Issue
Block a user