Add new Prometheus metrics endpoint
Doubles as a health check endpoint! Change some methods to private and rearrange them
This commit is contained in:
parent
4f147b2e59
commit
0c0a0d9085
@ -5,9 +5,12 @@ LABEL maintainer="ian@iamthefij.com"
|
|||||||
COPY ./sample-config.yml /app/config.yml
|
COPY ./sample-config.yml /app/config.yml
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
|
# Expose default metrics port
|
||||||
|
EXPOSE 8080
|
||||||
|
|
||||||
COPY ./README.md /app/
|
COPY ./README.md /app/
|
||||||
COPY ./setup.py /app/
|
COPY ./setup.py /app/
|
||||||
COPY ./minitor /app/minitor
|
COPY ./minitor /app/minitor
|
||||||
RUN pip install -e .
|
RUN pip install -e .
|
||||||
|
|
||||||
ENTRYPOINT python -m minitor.main
|
ENTRYPOINT [ "python3", "-m", "minitor.main" ]
|
||||||
|
5
Makefile
5
Makefile
@ -14,6 +14,11 @@ env:
|
|||||||
run: env
|
run: env
|
||||||
./env/bin/python -m minitor.main
|
./env/bin/python -m minitor.main
|
||||||
|
|
||||||
|
# Runs Minitor with metrics
|
||||||
|
.PHONY: run-metrics
|
||||||
|
run-metrics: env
|
||||||
|
./env/bin/python -m minitor.main --metrics
|
||||||
|
|
||||||
# Generates a smaller env for running tox, which builds its own env
|
# Generates a smaller env for running tox, which builds its own env
|
||||||
.PHONY: test-env
|
.PHONY: test-env
|
||||||
test-env:
|
test-env:
|
||||||
|
14
README.md
14
README.md
@ -53,6 +53,20 @@ In this repo, you can explore the `sample-config.yml` file for an example, but t
|
|||||||
|`{last_success}`|The ISO datetime of the last successful check|
|
|`{last_success}`|The ISO datetime of the last successful check|
|
||||||
|`{monitor_name}`|The name of the monitor that failed and triggered the alert|
|
|`{monitor_name}`|The name of the monitor that failed and triggered the alert|
|
||||||
|
|
||||||
|
### Metrics
|
||||||
|
|
||||||
|
As of v0.3.0, Minitor supports exporting metrics for [Prometheus](https://prometheus.io/). Prometheus is an open source tool for reading and querying metrics from different sources. Combined with another tool, [Grafana](https://grafana.com/), it allows building of charts and dashboards. You could also opt to just use Minitor to log check results, and instead do your alerting with Grafana.
|
||||||
|
|
||||||
|
It is also possible to use the metrics endpoint for monitoring Minitor itself! This allows setting up multiple instances of Minitor on different servers and having them monitor each other so that you can detect a Minitor outage.
|
||||||
|
|
||||||
|
To run Minitor with metrics, use the `--metrics` (or `-m`) flag. The metrics will be served on port `8080` by default, though it can be overridden using `--metrics-port` (or `-p`).
|
||||||
|
|
||||||
|
```
|
||||||
|
minitor --metrics
|
||||||
|
# or
|
||||||
|
minitor --metrics --metrics-port 3000
|
||||||
|
```
|
||||||
|
|
||||||
## Contributing
|
## Contributing
|
||||||
|
|
||||||
Whether you're looking to submit a patch or just tell me I broke something, you can contribute through the Github mirror and I can merge PRs back to the source repository.
|
Whether you're looking to submit a patch or just tell me I broke something, you can contribute through the Github mirror and I can merge PRs back to the source repository.
|
||||||
|
168
minitor/main.py
168
minitor/main.py
@ -9,8 +9,12 @@ from subprocess import check_output
|
|||||||
from time import sleep
|
from time import sleep
|
||||||
|
|
||||||
import yamlenv
|
import yamlenv
|
||||||
|
from prometheus_client import Counter
|
||||||
|
from prometheus_client import Gauge
|
||||||
|
from prometheus_client import start_http_server
|
||||||
|
|
||||||
|
|
||||||
|
DEFAULT_METRICS_PORT = 8080
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
format='%(asctime)s %(levelname)s %(name)s %(message)s'
|
format='%(asctime)s %(levelname)s %(name)s %(message)s'
|
||||||
@ -107,7 +111,7 @@ class MinitorAlert(Exception):
|
|||||||
class Monitor(object):
|
class Monitor(object):
|
||||||
"""Primary configuration item for Minitor"""
|
"""Primary configuration item for Minitor"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config, counter=None):
|
||||||
"""Accepts a dictionary of configuration items to override defaults"""
|
"""Accepts a dictionary of configuration items to override defaults"""
|
||||||
settings = {
|
settings = {
|
||||||
'alerts': ['log'],
|
'alerts': ['log'],
|
||||||
@ -134,10 +138,19 @@ class Monitor(object):
|
|||||||
self.last_success = None
|
self.last_success = None
|
||||||
self.total_failure_count = 0
|
self.total_failure_count = 0
|
||||||
|
|
||||||
self.logger = logging.getLogger(
|
self._counter = counter
|
||||||
|
self._logger = logging.getLogger(
|
||||||
'{}({})'.format(self.__class__.__name__, self.name)
|
'{}({})'.format(self.__class__.__name__, self.name)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _count_check(self, is_success=True, is_alert=False):
|
||||||
|
if self._counter is not None:
|
||||||
|
self._counter.labels(
|
||||||
|
monitor=self.name,
|
||||||
|
status=('success' if is_success else 'failure'),
|
||||||
|
is_alert=is_alert,
|
||||||
|
).inc()
|
||||||
|
|
||||||
def should_check(self):
|
def should_check(self):
|
||||||
"""Determines if this Monitor should run it's check command"""
|
"""Determines if this Monitor should run it's check command"""
|
||||||
if not self.last_check:
|
if not self.last_check:
|
||||||
@ -158,16 +171,24 @@ class Monitor(object):
|
|||||||
shell=isinstance(self.command, str),
|
shell=isinstance(self.command, str),
|
||||||
)
|
)
|
||||||
output = maybe_decode(output)
|
output = maybe_decode(output)
|
||||||
self.logger.debug(output)
|
self._logger.debug(output)
|
||||||
self.last_check = datetime.now()
|
self.last_check = datetime.now()
|
||||||
self.last_output = output
|
self.last_output = output
|
||||||
|
|
||||||
|
is_success = None
|
||||||
|
try:
|
||||||
if ex is None:
|
if ex is None:
|
||||||
|
is_success = True
|
||||||
self.success()
|
self.success()
|
||||||
return True
|
|
||||||
else:
|
else:
|
||||||
|
is_success = False
|
||||||
self.failure()
|
self.failure()
|
||||||
return False
|
except MinitorAlert:
|
||||||
|
self._count_check(is_success=is_success, is_alert=True)
|
||||||
|
raise
|
||||||
|
|
||||||
|
self._count_check(is_success=is_success)
|
||||||
|
return is_success
|
||||||
|
|
||||||
def success(self):
|
def success(self):
|
||||||
"""Handles success tasks"""
|
"""Handles success tasks"""
|
||||||
@ -212,17 +233,26 @@ class Monitor(object):
|
|||||||
|
|
||||||
|
|
||||||
class Alert(object):
|
class Alert(object):
|
||||||
def __init__(self, name, config):
|
def __init__(self, name, config, counter=None):
|
||||||
"""An alert must be named and have a config dict"""
|
"""An alert must be named and have a config dict"""
|
||||||
self.name = name
|
self.name = name
|
||||||
self.command = config.get('command')
|
self.command = config.get('command')
|
||||||
if not self.command:
|
if not self.command:
|
||||||
raise InvalidAlertException('Invalid alert {}'.format(self.name))
|
raise InvalidAlertException('Invalid alert {}'.format(self.name))
|
||||||
|
|
||||||
self.logger = logging.getLogger(
|
self._counter = counter
|
||||||
|
self._logger = logging.getLogger(
|
||||||
'{}({})'.format(self.__class__.__name__, self.name)
|
'{}({})'.format(self.__class__.__name__, self.name)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _count_alert(self, monitor):
|
||||||
|
"""Increments the alert counter"""
|
||||||
|
if self._counter is not None:
|
||||||
|
self._counter.labels(
|
||||||
|
alert=self.name,
|
||||||
|
monitor=monitor,
|
||||||
|
).inc()
|
||||||
|
|
||||||
def _formated_command(self, **kwargs):
|
def _formated_command(self, **kwargs):
|
||||||
"""Formats command array or string with kwargs from Monitor"""
|
"""Formats command array or string with kwargs from Monitor"""
|
||||||
if isinstance(self.command, str):
|
if isinstance(self.command, str):
|
||||||
@ -240,6 +270,7 @@ class Alert(object):
|
|||||||
|
|
||||||
def alert(self, message, monitor):
|
def alert(self, message, monitor):
|
||||||
"""Calls the alert command for the provided monitor"""
|
"""Calls the alert command for the provided monitor"""
|
||||||
|
self._count_alert(monitor.name)
|
||||||
output, ex = call_output(
|
output, ex = call_output(
|
||||||
self._formated_command(
|
self._formated_command(
|
||||||
alert_count=monitor.alert_count,
|
alert_count=monitor.alert_count,
|
||||||
@ -251,7 +282,7 @@ class Alert(object):
|
|||||||
),
|
),
|
||||||
shell=isinstance(self.command, str),
|
shell=isinstance(self.command, str),
|
||||||
)
|
)
|
||||||
self.logger.error(maybe_decode(output))
|
self._logger.error(maybe_decode(output))
|
||||||
if ex is not None:
|
if ex is not None:
|
||||||
raise ex
|
raise ex
|
||||||
|
|
||||||
@ -263,26 +294,57 @@ class Minitor(object):
|
|||||||
check_interval = None
|
check_interval = None
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.logger = logging.getLogger(self.__class__.__name__)
|
self._logger = logging.getLogger(self.__class__.__name__)
|
||||||
|
self._alert_counter = None
|
||||||
|
self._monitor_counter = None
|
||||||
|
self._monitor_status_gauge = None
|
||||||
|
|
||||||
def setup(self, config_path):
|
def _parse_args(self):
|
||||||
|
"""Parses command line arguments and returns them"""
|
||||||
|
parser = ArgumentParser(description='Minimal monitoring')
|
||||||
|
parser.add_argument(
|
||||||
|
'--config', '-c',
|
||||||
|
dest='config_path',
|
||||||
|
default='config.yml',
|
||||||
|
help='Path to the config YAML file to use',
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--metrics', '-m',
|
||||||
|
dest='metrics',
|
||||||
|
action='store_true',
|
||||||
|
help='Start webserver with metrics',
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--metrics-port', '-p',
|
||||||
|
dest='metrics_port',
|
||||||
|
type=int,
|
||||||
|
default=DEFAULT_METRICS_PORT,
|
||||||
|
help='Port to use when serving metrics',
|
||||||
|
)
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
def _setup(self, config_path):
|
||||||
"""Load all setup from YAML file at provided path"""
|
"""Load all setup from YAML file at provided path"""
|
||||||
config = read_yaml(config_path)
|
config = read_yaml(config_path)
|
||||||
self.check_interval = config.get('check_interval', 30)
|
self.check_interval = config.get('check_interval', 30)
|
||||||
self.monitors = [Monitor(mon) for mon in config.get('monitors', [])]
|
self.monitors = [
|
||||||
|
Monitor(mon, counter=self._monitor_counter)
|
||||||
|
for mon in config.get('monitors', [])
|
||||||
|
]
|
||||||
# Add default alert for logging
|
# Add default alert for logging
|
||||||
self.alerts = {
|
self.alerts = {
|
||||||
'log': Alert(
|
'log': Alert(
|
||||||
'log',
|
'log',
|
||||||
{'command': ['echo', '{alert_message}!']}
|
{'command': ['echo', '{alert_message}!']},
|
||||||
|
counter=self._alert_counter,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
self.alerts.update({
|
self.alerts.update({
|
||||||
alert_name: Alert(alert_name, alert)
|
alert_name: Alert(alert_name, alert, counter=self._alert_counter)
|
||||||
for alert_name, alert in config.get('alerts', {}).items()
|
for alert_name, alert in config.get('alerts', {}).items()
|
||||||
})
|
})
|
||||||
|
|
||||||
def validate_monitors(self):
|
def _validate_monitors(self):
|
||||||
"""Validates monitors are valid against other config values"""
|
"""Validates monitors are valid against other config values"""
|
||||||
for monitor in self.monitors:
|
for monitor in self.monitors:
|
||||||
# Validate that the interval is valid
|
# Validate that the interval is valid
|
||||||
@ -301,45 +363,65 @@ class Minitor(object):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def handle_minitor_alert(self, minitor_alert):
|
def _init_metrics(self):
|
||||||
|
self._alert_counter = Counter(
|
||||||
|
'minitor_alert_total',
|
||||||
|
'Number of Minitor alerts',
|
||||||
|
['alert', 'monitor'],
|
||||||
|
)
|
||||||
|
self._monitor_counter = Counter(
|
||||||
|
'minitor_check_total',
|
||||||
|
'Number of Minitor checks',
|
||||||
|
['monitor', 'status', 'is_alert'],
|
||||||
|
)
|
||||||
|
self._monitor_status_gauge = Gauge(
|
||||||
|
'minitor_monitor_up_count',
|
||||||
|
'Currently responsive monitors',
|
||||||
|
['monitor'],
|
||||||
|
)
|
||||||
|
|
||||||
|
def _loop(self):
|
||||||
|
"""The main run loop"""
|
||||||
|
while True:
|
||||||
|
for monitor in self.monitors:
|
||||||
|
try:
|
||||||
|
result = monitor.check()
|
||||||
|
if result is not None:
|
||||||
|
self._logger.info(
|
||||||
|
'%s: %s',
|
||||||
|
monitor.name,
|
||||||
|
'SUCCESS' if result else 'FAILURE'
|
||||||
|
)
|
||||||
|
except MinitorAlert as minitor_alert:
|
||||||
|
self._logger.warning(minitor_alert)
|
||||||
|
self._handle_minitor_alert(minitor_alert)
|
||||||
|
|
||||||
|
# Track the status of the Monitor
|
||||||
|
if self._monitor_status_gauge:
|
||||||
|
self._monitor_status_gauge.labels(
|
||||||
|
monitor=monitor.name,
|
||||||
|
).set(int(monitor.is_up()))
|
||||||
|
|
||||||
|
sleep(self.check_interval)
|
||||||
|
|
||||||
|
def _handle_minitor_alert(self, minitor_alert):
|
||||||
"""Issues all alerts for a provided monitor"""
|
"""Issues all alerts for a provided monitor"""
|
||||||
monitor = minitor_alert.monitor
|
monitor = minitor_alert.monitor
|
||||||
alerts = monitor.alert_up if monitor.is_up() else monitor.alert_down
|
alerts = monitor.alert_up if monitor.is_up() else monitor.alert_down
|
||||||
for alert in alerts:
|
for alert in alerts:
|
||||||
self.alerts[alert].alert(str(minitor_alert), monitor)
|
self.alerts[alert].alert(str(minitor_alert), monitor)
|
||||||
|
|
||||||
def parse_args(self):
|
|
||||||
"""Parses command line arguments and returns them"""
|
|
||||||
parser = ArgumentParser(description='Minimal monitoring')
|
|
||||||
parser.add_argument(
|
|
||||||
'--config', '-c',
|
|
||||||
dest='config_path',
|
|
||||||
default='config.yml',
|
|
||||||
help='Path to the config YAML file to use',
|
|
||||||
)
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
"""Runs Minitor in a loop"""
|
"""Runs Minitor in a loop"""
|
||||||
args = self.parse_args()
|
args = self._parse_args()
|
||||||
self.setup(args.config_path)
|
self._setup(args.config_path)
|
||||||
self.validate_monitors()
|
self._validate_monitors()
|
||||||
|
|
||||||
while True:
|
if args.metrics:
|
||||||
for monitor in self.monitors:
|
self._init_metrics()
|
||||||
try:
|
start_http_server(args.metrics_port)
|
||||||
result = monitor.check()
|
|
||||||
if result is not None:
|
|
||||||
self.logger.info(
|
|
||||||
'%s: %s',
|
|
||||||
monitor.name,
|
|
||||||
'SUCCESS' if result else 'FAILURE'
|
|
||||||
)
|
|
||||||
except MinitorAlert as minitor_alert:
|
|
||||||
self.logger.warning(minitor_alert)
|
|
||||||
self.handle_minitor_alert(minitor_alert)
|
|
||||||
|
|
||||||
sleep(self.check_interval)
|
self._loop()
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
3
setup.py
3
setup.py
@ -12,7 +12,7 @@ with open(path.join(here, 'README.md'), encoding='utf-8') as f:
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name='minitor',
|
name='minitor',
|
||||||
version='0.2.1',
|
version='0.3.0',
|
||||||
description='A minimal monitoring tool',
|
description='A minimal monitoring tool',
|
||||||
long_description=long_description,
|
long_description=long_description,
|
||||||
url='https://git.iamthefij.com/iamthefij/minitor',
|
url='https://git.iamthefij.com/iamthefij/minitor',
|
||||||
@ -38,6 +38,7 @@ setup(
|
|||||||
keywords='minitor monitoring alerting',
|
keywords='minitor monitoring alerting',
|
||||||
packages=find_packages(exclude=['contrib', 'docs', 'tests']),
|
packages=find_packages(exclude=['contrib', 'docs', 'tests']),
|
||||||
install_requires=[
|
install_requires=[
|
||||||
|
'prometheus_client',
|
||||||
'yamlenv',
|
'yamlenv',
|
||||||
],
|
],
|
||||||
entry_points={
|
entry_points={
|
||||||
|
@ -50,7 +50,7 @@ class TestAlert(object):
|
|||||||
monitor.last_output = 'beep boop'
|
monitor.last_output = 'beep boop'
|
||||||
monitor.last_success = last_success
|
monitor.last_success = last_success
|
||||||
monitor.total_failure_count = 1
|
monitor.total_failure_count = 1
|
||||||
with patch.object(echo_alert.logger, 'error') as mock_error:
|
with patch.object(echo_alert._logger, 'error') as mock_error:
|
||||||
echo_alert.alert('Exception message', monitor)
|
echo_alert.alert('Exception message', monitor)
|
||||||
mock_error.assert_called_once_with(
|
mock_error.assert_called_once_with(
|
||||||
'Dummy Monitor has failed 1 time(s)!\n'
|
'Dummy Monitor has failed 1 time(s)!\n'
|
||||||
|
Loading…
Reference in New Issue
Block a user