Add new Prometheus metrics endpoint

Doubles as a health check endpoint!

Change some methods to private and rearrange them
This commit is contained in:
IamTheFij 2018-07-12 23:01:36 -07:00
parent 4f147b2e59
commit 0c0a0d9085
6 changed files with 155 additions and 50 deletions

View File

@ -5,9 +5,12 @@ LABEL maintainer="ian@iamthefij.com"
COPY ./sample-config.yml /app/config.yml
WORKDIR /app
# Expose default metrics port
EXPOSE 8080
COPY ./README.md /app/
COPY ./setup.py /app/
COPY ./minitor /app/minitor
RUN pip install -e .
ENTRYPOINT python -m minitor.main
ENTRYPOINT [ "python3", "-m", "minitor.main" ]

View File

@ -14,6 +14,11 @@ env:
run: env
./env/bin/python -m minitor.main
# Runs Minitor with metrics
.PHONY: run-metrics
run-metrics: env
./env/bin/python -m minitor.main --metrics
# Generates a smaller env for running tox, which builds it's own env
.PHONY: test-env
test-env:

View File

@ -53,6 +53,20 @@ In this repo, you can explore the `sample-config.yml` file for an example, but t
|`{last_success}`|The ISO datetime of the last successful check|
|`{monitor_name}`|The name of the monitor that failed and triggered the alert|
### Metrics
As of v0.3.0, Minitor supports exporting metrics for [Prometheus](https://prometheus.io/). Prometheus is an open source tool for reading and querying metrics from different sources. Combined with another tool, [Grafana](https://grafana.com/), it allows building of charts and dashboards. You could also opt to just use Minitor to log check results, and instead do your alerting with Grafana.
It is also possible to use the metrics endpoint for monitoring Minitor itself! This allows setting up multiple instances of Minitor on different servers and have them monitor each-other so that you can detect a minitor outage.
To run minitor with metrics, use the `--metrics` (or `-m`) flag. The metrics will be served on port `8080` by default, though it can be overriden using `--metrics-port` (or `-p`)
```
minitor --metrics
# or
minitor --metrics --metrics-port 3000
```
## Contributing
Whether you're looking to submit a patch or just tell me I broke something, you can contribute through the Github mirror and I can merge PRs back to the source repository.

View File

@ -9,8 +9,12 @@ from subprocess import check_output
from time import sleep
import yamlenv
from prometheus_client import Counter
from prometheus_client import Gauge
from prometheus_client import start_http_server
DEFAULT_METRICS_PORT = 8080
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s %(levelname)s %(name)s %(message)s'
@ -107,7 +111,7 @@ class MinitorAlert(Exception):
class Monitor(object):
"""Primary configuration item for Minitor"""
def __init__(self, config):
def __init__(self, config, counter=None):
"""Accepts a dictionary of configuration items to override defaults"""
settings = {
'alerts': ['log'],
@ -134,10 +138,19 @@ class Monitor(object):
self.last_success = None
self.total_failure_count = 0
self.logger = logging.getLogger(
self._counter = counter
self._logger = logging.getLogger(
'{}({})'.format(self.__class__.__name__, self.name)
)
def _count_check(self, is_success=True, is_alert=False):
if self._counter is not None:
self._counter.labels(
monitor=self.name,
status=('success' if is_success else 'failure'),
is_alert=is_alert,
).inc()
def should_check(self):
"""Determines if this Monitor should run it's check command"""
if not self.last_check:
@ -158,16 +171,24 @@ class Monitor(object):
shell=isinstance(self.command, str),
)
output = maybe_decode(output)
self.logger.debug(output)
self._logger.debug(output)
self.last_check = datetime.now()
self.last_output = output
if ex is None:
self.success()
return True
else:
self.failure()
return False
is_success = None
try:
if ex is None:
is_success = True
self.success()
else:
is_success = False
self.failure()
except MinitorAlert:
self._count_check(is_success=is_success, is_alert=True)
raise
self._count_check(is_success=is_success)
return is_success
def success(self):
"""Handles success tasks"""
@ -212,17 +233,26 @@ class Monitor(object):
class Alert(object):
def __init__(self, name, config):
def __init__(self, name, config, counter=None):
"""An alert must be named and have a config dict"""
self.name = name
self.command = config.get('command')
if not self.command:
raise InvalidAlertException('Invalid alert {}'.format(self.name))
self.logger = logging.getLogger(
self._counter = counter
self._logger = logging.getLogger(
'{}({})'.format(self.__class__.__name__, self.name)
)
def _count_alert(self, monitor):
"""Increments the alert counter"""
if self._counter is not None:
self._counter.labels(
alert=self.name,
monitor=monitor,
).inc()
def _formated_command(self, **kwargs):
"""Formats command array or string with kwargs from Monitor"""
if isinstance(self.command, str):
@ -240,6 +270,7 @@ class Alert(object):
def alert(self, message, monitor):
"""Calls the alert command for the provided monitor"""
self._count_alert(monitor.name)
output, ex = call_output(
self._formated_command(
alert_count=monitor.alert_count,
@ -251,7 +282,7 @@ class Alert(object):
),
shell=isinstance(self.command, str),
)
self.logger.error(maybe_decode(output))
self._logger.error(maybe_decode(output))
if ex is not None:
raise ex
@ -263,26 +294,57 @@ class Minitor(object):
check_interval = None
def __init__(self):
self.logger = logging.getLogger(self.__class__.__name__)
self._logger = logging.getLogger(self.__class__.__name__)
self._alert_counter = None
self._monitor_counter = None
self._monitor_status_gauge = None
def setup(self, config_path):
def _parse_args(self):
"""Parses command line arguments and returns them"""
parser = ArgumentParser(description='Minimal monitoring')
parser.add_argument(
'--config', '-c',
dest='config_path',
default='config.yml',
help='Path to the config YAML file to use',
)
parser.add_argument(
'--metrics', '-m',
dest='metrics',
action='store_true',
help='Start webserver with metrics',
)
parser.add_argument(
'--metrics-port', '-p',
dest='metrics_port',
type=int,
default=DEFAULT_METRICS_PORT,
help='Port to use when serving metrics',
)
return parser.parse_args()
def _setup(self, config_path):
"""Load all setup from YAML file at provided path"""
config = read_yaml(config_path)
self.check_interval = config.get('check_interval', 30)
self.monitors = [Monitor(mon) for mon in config.get('monitors', [])]
self.monitors = [
Monitor(mon, counter=self._monitor_counter)
for mon in config.get('monitors', [])
]
# Add default alert for logging
self.alerts = {
'log': Alert(
'log',
{'command': ['echo', '{alert_message}!']}
{'command': ['echo', '{alert_message}!']},
counter=self._alert_counter,
)
}
self.alerts.update({
alert_name: Alert(alert_name, alert)
alert_name: Alert(alert_name, alert, counter=self._alert_counter)
for alert_name, alert in config.get('alerts', {}).items()
})
def validate_monitors(self):
def _validate_monitors(self):
"""Validates monitors are valid against other config values"""
for monitor in self.monitors:
# Validate that the interval is valid
@ -301,45 +363,65 @@ class Minitor(object):
)
)
def handle_minitor_alert(self, minitor_alert):
def _init_metrics(self):
self._alert_counter = Counter(
'minitor_alert_total',
'Number of Minitor alerts',
['alert', 'monitor'],
)
self._monitor_counter = Counter(
'minitor_check_total',
'Number of Minitor checks',
['monitor', 'status', 'is_alert'],
)
self._monitor_status_gauge = Gauge(
'minitor_monitor_up_count',
'Currently responsive monitors',
['monitor'],
)
def _loop(self):
"""The main run loop"""
while True:
for monitor in self.monitors:
try:
result = monitor.check()
if result is not None:
self._logger.info(
'%s: %s',
monitor.name,
'SUCCESS' if result else 'FAILURE'
)
except MinitorAlert as minitor_alert:
self._logger.warning(minitor_alert)
self._handle_minitor_alert(minitor_alert)
# Track the status of the Monitor
if self._monitor_status_gauge:
self._monitor_status_gauge.labels(
monitor=monitor.name,
).set(int(monitor.is_up()))
sleep(self.check_interval)
def _handle_minitor_alert(self, minitor_alert):
"""Issues all alerts for a provided monitor"""
monitor = minitor_alert.monitor
alerts = monitor.alert_up if monitor.is_up() else monitor.alert_down
for alert in alerts:
self.alerts[alert].alert(str(minitor_alert), monitor)
def parse_args(self):
"""Parses command line arguments and returns them"""
parser = ArgumentParser(description='Minimal monitoring')
parser.add_argument(
'--config', '-c',
dest='config_path',
default='config.yml',
help='Path to the config YAML file to use',
)
return parser.parse_args()
def run(self):
"""Runs Minitor in a loop"""
args = self.parse_args()
self.setup(args.config_path)
self.validate_monitors()
args = self._parse_args()
self._setup(args.config_path)
self._validate_monitors()
while True:
for monitor in self.monitors:
try:
result = monitor.check()
if result is not None:
self.logger.info(
'%s: %s',
monitor.name,
'SUCCESS' if result else 'FAILURE'
)
except MinitorAlert as minitor_alert:
self.logger.warning(minitor_alert)
self.handle_minitor_alert(minitor_alert)
if args.metrics:
self._init_metrics()
start_http_server(args.metrics_port)
sleep(self.check_interval)
self._loop()
def main():

View File

@ -12,7 +12,7 @@ with open(path.join(here, 'README.md'), encoding='utf-8') as f:
setup(
name='minitor',
version='0.2.1',
version='0.3.0',
description='A minimal monitoring tool',
long_description=long_description,
url='https://git.iamthefij.com/iamthefij/minitor',
@ -38,6 +38,7 @@ setup(
keywords='minitor monitoring alerting',
packages=find_packages(exclude=['contrib', 'docs', 'tests']),
install_requires=[
'prometheus_client',
'yamlenv',
],
entry_points={

View File

@ -50,7 +50,7 @@ class TestAlert(object):
monitor.last_output = 'beep boop'
monitor.last_success = last_success
monitor.total_failure_count = 1
with patch.object(echo_alert.logger, 'error') as mock_error:
with patch.object(echo_alert._logger, 'error') as mock_error:
echo_alert.alert('Exception message', monitor)
mock_error.assert_called_once_with(
'Dummy Monitor has failed 1 time(s)!\n'