Add new Prometheus metrics endpoint

Doubles as a health check endpoint!

Change some methods to private and rearrange them
This commit is contained in:
IamTheFij 2018-07-12 23:01:36 -07:00
parent 4f147b2e59
commit 0c0a0d9085
6 changed files with 155 additions and 50 deletions

View File

@ -5,9 +5,12 @@ LABEL maintainer="ian@iamthefij.com"
COPY ./sample-config.yml /app/config.yml COPY ./sample-config.yml /app/config.yml
WORKDIR /app WORKDIR /app
# Expose default metrics port
EXPOSE 8080
COPY ./README.md /app/ COPY ./README.md /app/
COPY ./setup.py /app/ COPY ./setup.py /app/
COPY ./minitor /app/minitor COPY ./minitor /app/minitor
RUN pip install -e . RUN pip install -e .
ENTRYPOINT python -m minitor.main ENTRYPOINT [ "python3", "-m", "minitor.main" ]

View File

@ -14,6 +14,11 @@ env:
run: env run: env
./env/bin/python -m minitor.main ./env/bin/python -m minitor.main
# Runs Minitor with metrics
.PHONY: run-metrics
run-metrics: env
./env/bin/python -m minitor.main --metrics
# Generates a smaller env for running tox, which builds it's own env # Generates a smaller env for running tox, which builds it's own env
.PHONY: test-env .PHONY: test-env
test-env: test-env:

View File

@ -53,6 +53,20 @@ In this repo, you can explore the `sample-config.yml` file for an example, but t
|`{last_success}`|The ISO datetime of the last successful check| |`{last_success}`|The ISO datetime of the last successful check|
|`{monitor_name}`|The name of the monitor that failed and triggered the alert| |`{monitor_name}`|The name of the monitor that failed and triggered the alert|
### Metrics
As of v0.3.0, Minitor supports exporting metrics for [Prometheus](https://prometheus.io/). Prometheus is an open source tool for reading and querying metrics from different sources. Combined with another tool, [Grafana](https://grafana.com/), it allows you to build charts and dashboards. You could also opt to just use Minitor to log check results, and instead do your alerting with Grafana.
It is also possible to use the metrics endpoint for monitoring Minitor itself! This allows setting up multiple instances of Minitor on different servers and having them monitor each other so that you can detect a Minitor outage.
To run Minitor with metrics, use the `--metrics` (or `-m`) flag. The metrics will be served on port `8080` by default, though it can be overridden using `--metrics-port` (or `-p`).
```
minitor --metrics
# or
minitor --metrics --metrics-port 3000
```
## Contributing ## Contributing
Whether you're looking to submit a patch or just tell me I broke something, you can contribute through the Github mirror and I can merge PRs back to the source repository. Whether you're looking to submit a patch or just tell me I broke something, you can contribute through the Github mirror and I can merge PRs back to the source repository.

View File

@ -9,8 +9,12 @@ from subprocess import check_output
from time import sleep from time import sleep
import yamlenv import yamlenv
from prometheus_client import Counter
from prometheus_client import Gauge
from prometheus_client import start_http_server
DEFAULT_METRICS_PORT = 8080
logging.basicConfig( logging.basicConfig(
level=logging.INFO, level=logging.INFO,
format='%(asctime)s %(levelname)s %(name)s %(message)s' format='%(asctime)s %(levelname)s %(name)s %(message)s'
@ -107,7 +111,7 @@ class MinitorAlert(Exception):
class Monitor(object): class Monitor(object):
"""Primary configuration item for Minitor""" """Primary configuration item for Minitor"""
def __init__(self, config): def __init__(self, config, counter=None):
"""Accepts a dictionary of configuration items to override defaults""" """Accepts a dictionary of configuration items to override defaults"""
settings = { settings = {
'alerts': ['log'], 'alerts': ['log'],
@ -134,10 +138,19 @@ class Monitor(object):
self.last_success = None self.last_success = None
self.total_failure_count = 0 self.total_failure_count = 0
self.logger = logging.getLogger( self._counter = counter
self._logger = logging.getLogger(
'{}({})'.format(self.__class__.__name__, self.name) '{}({})'.format(self.__class__.__name__, self.name)
) )
def _count_check(self, is_success=True, is_alert=False):
if self._counter is not None:
self._counter.labels(
monitor=self.name,
status=('success' if is_success else 'failure'),
is_alert=is_alert,
).inc()
def should_check(self): def should_check(self):
"""Determines if this Monitor should run it's check command""" """Determines if this Monitor should run it's check command"""
if not self.last_check: if not self.last_check:
@ -158,16 +171,24 @@ class Monitor(object):
shell=isinstance(self.command, str), shell=isinstance(self.command, str),
) )
output = maybe_decode(output) output = maybe_decode(output)
self.logger.debug(output) self._logger.debug(output)
self.last_check = datetime.now() self.last_check = datetime.now()
self.last_output = output self.last_output = output
if ex is None: is_success = None
self.success() try:
return True if ex is None:
else: is_success = True
self.failure() self.success()
return False else:
is_success = False
self.failure()
except MinitorAlert:
self._count_check(is_success=is_success, is_alert=True)
raise
self._count_check(is_success=is_success)
return is_success
def success(self): def success(self):
"""Handles success tasks""" """Handles success tasks"""
@ -212,17 +233,26 @@ class Monitor(object):
class Alert(object): class Alert(object):
def __init__(self, name, config): def __init__(self, name, config, counter=None):
"""An alert must be named and have a config dict""" """An alert must be named and have a config dict"""
self.name = name self.name = name
self.command = config.get('command') self.command = config.get('command')
if not self.command: if not self.command:
raise InvalidAlertException('Invalid alert {}'.format(self.name)) raise InvalidAlertException('Invalid alert {}'.format(self.name))
self.logger = logging.getLogger( self._counter = counter
self._logger = logging.getLogger(
'{}({})'.format(self.__class__.__name__, self.name) '{}({})'.format(self.__class__.__name__, self.name)
) )
def _count_alert(self, monitor):
"""Increments the alert counter"""
if self._counter is not None:
self._counter.labels(
alert=self.name,
monitor=monitor,
).inc()
def _formated_command(self, **kwargs): def _formated_command(self, **kwargs):
"""Formats command array or string with kwargs from Monitor""" """Formats command array or string with kwargs from Monitor"""
if isinstance(self.command, str): if isinstance(self.command, str):
@ -240,6 +270,7 @@ class Alert(object):
def alert(self, message, monitor): def alert(self, message, monitor):
"""Calls the alert command for the provided monitor""" """Calls the alert command for the provided monitor"""
self._count_alert(monitor.name)
output, ex = call_output( output, ex = call_output(
self._formated_command( self._formated_command(
alert_count=monitor.alert_count, alert_count=monitor.alert_count,
@ -251,7 +282,7 @@ class Alert(object):
), ),
shell=isinstance(self.command, str), shell=isinstance(self.command, str),
) )
self.logger.error(maybe_decode(output)) self._logger.error(maybe_decode(output))
if ex is not None: if ex is not None:
raise ex raise ex
@ -263,26 +294,57 @@ class Minitor(object):
check_interval = None check_interval = None
def __init__(self): def __init__(self):
self.logger = logging.getLogger(self.__class__.__name__) self._logger = logging.getLogger(self.__class__.__name__)
self._alert_counter = None
self._monitor_counter = None
self._monitor_status_gauge = None
def setup(self, config_path): def _parse_args(self):
"""Parses command line arguments and returns them"""
parser = ArgumentParser(description='Minimal monitoring')
parser.add_argument(
'--config', '-c',
dest='config_path',
default='config.yml',
help='Path to the config YAML file to use',
)
parser.add_argument(
'--metrics', '-m',
dest='metrics',
action='store_true',
help='Start webserver with metrics',
)
parser.add_argument(
'--metrics-port', '-p',
dest='metrics_port',
type=int,
default=DEFAULT_METRICS_PORT,
help='Port to use when serving metrics',
)
return parser.parse_args()
def _setup(self, config_path):
"""Load all setup from YAML file at provided path""" """Load all setup from YAML file at provided path"""
config = read_yaml(config_path) config = read_yaml(config_path)
self.check_interval = config.get('check_interval', 30) self.check_interval = config.get('check_interval', 30)
self.monitors = [Monitor(mon) for mon in config.get('monitors', [])] self.monitors = [
Monitor(mon, counter=self._monitor_counter)
for mon in config.get('monitors', [])
]
# Add default alert for logging # Add default alert for logging
self.alerts = { self.alerts = {
'log': Alert( 'log': Alert(
'log', 'log',
{'command': ['echo', '{alert_message}!']} {'command': ['echo', '{alert_message}!']},
counter=self._alert_counter,
) )
} }
self.alerts.update({ self.alerts.update({
alert_name: Alert(alert_name, alert) alert_name: Alert(alert_name, alert, counter=self._alert_counter)
for alert_name, alert in config.get('alerts', {}).items() for alert_name, alert in config.get('alerts', {}).items()
}) })
def validate_monitors(self): def _validate_monitors(self):
"""Validates monitors are valid against other config values""" """Validates monitors are valid against other config values"""
for monitor in self.monitors: for monitor in self.monitors:
# Validate that the interval is valid # Validate that the interval is valid
@ -301,45 +363,65 @@ class Minitor(object):
) )
) )
def handle_minitor_alert(self, minitor_alert): def _init_metrics(self):
self._alert_counter = Counter(
'minitor_alert_total',
'Number of Minitor alerts',
['alert', 'monitor'],
)
self._monitor_counter = Counter(
'minitor_check_total',
'Number of Minitor checks',
['monitor', 'status', 'is_alert'],
)
self._monitor_status_gauge = Gauge(
'minitor_monitor_up_count',
'Currently responsive monitors',
['monitor'],
)
def _loop(self):
"""The main run loop"""
while True:
for monitor in self.monitors:
try:
result = monitor.check()
if result is not None:
self._logger.info(
'%s: %s',
monitor.name,
'SUCCESS' if result else 'FAILURE'
)
except MinitorAlert as minitor_alert:
self._logger.warning(minitor_alert)
self._handle_minitor_alert(minitor_alert)
# Track the status of the Monitor
if self._monitor_status_gauge:
self._monitor_status_gauge.labels(
monitor=monitor.name,
).set(int(monitor.is_up()))
sleep(self.check_interval)
def _handle_minitor_alert(self, minitor_alert):
"""Issues all alerts for a provided monitor""" """Issues all alerts for a provided monitor"""
monitor = minitor_alert.monitor monitor = minitor_alert.monitor
alerts = monitor.alert_up if monitor.is_up() else monitor.alert_down alerts = monitor.alert_up if monitor.is_up() else monitor.alert_down
for alert in alerts: for alert in alerts:
self.alerts[alert].alert(str(minitor_alert), monitor) self.alerts[alert].alert(str(minitor_alert), monitor)
def parse_args(self):
"""Parses command line arguments and returns them"""
parser = ArgumentParser(description='Minimal monitoring')
parser.add_argument(
'--config', '-c',
dest='config_path',
default='config.yml',
help='Path to the config YAML file to use',
)
return parser.parse_args()
def run(self): def run(self):
"""Runs Minitor in a loop""" """Runs Minitor in a loop"""
args = self.parse_args() args = self._parse_args()
self.setup(args.config_path) self._setup(args.config_path)
self.validate_monitors() self._validate_monitors()
while True: if args.metrics:
for monitor in self.monitors: self._init_metrics()
try: start_http_server(args.metrics_port)
result = monitor.check()
if result is not None:
self.logger.info(
'%s: %s',
monitor.name,
'SUCCESS' if result else 'FAILURE'
)
except MinitorAlert as minitor_alert:
self.logger.warning(minitor_alert)
self.handle_minitor_alert(minitor_alert)
sleep(self.check_interval) self._loop()
def main(): def main():

View File

@ -12,7 +12,7 @@ with open(path.join(here, 'README.md'), encoding='utf-8') as f:
setup( setup(
name='minitor', name='minitor',
version='0.2.1', version='0.3.0',
description='A minimal monitoring tool', description='A minimal monitoring tool',
long_description=long_description, long_description=long_description,
url='https://git.iamthefij.com/iamthefij/minitor', url='https://git.iamthefij.com/iamthefij/minitor',
@ -38,6 +38,7 @@ setup(
keywords='minitor monitoring alerting', keywords='minitor monitoring alerting',
packages=find_packages(exclude=['contrib', 'docs', 'tests']), packages=find_packages(exclude=['contrib', 'docs', 'tests']),
install_requires=[ install_requires=[
'prometheus_client',
'yamlenv', 'yamlenv',
], ],
entry_points={ entry_points={

View File

@ -50,7 +50,7 @@ class TestAlert(object):
monitor.last_output = 'beep boop' monitor.last_output = 'beep boop'
monitor.last_success = last_success monitor.last_success = last_success
monitor.total_failure_count = 1 monitor.total_failure_count = 1
with patch.object(echo_alert.logger, 'error') as mock_error: with patch.object(echo_alert._logger, 'error') as mock_error:
echo_alert.alert('Exception message', monitor) echo_alert.alert('Exception message', monitor)
mock_error.assert_called_once_with( mock_error.assert_called_once_with(
'Dummy Monitor has failed 1 time(s)!\n' 'Dummy Monitor has failed 1 time(s)!\n'