"""Minitor: a minimal monitoring daemon.

Runs configured monitor commands on an interval and, when a monitor has
failed often enough (or recovers), runs the configured alert commands.
Optionally exposes Prometheus metrics over HTTP.
"""
import logging
import subprocess
import sys
from argparse import ArgumentParser
from datetime import datetime
from itertools import chain
from subprocess import CalledProcessError
from subprocess import check_output
from time import sleep

import yamlenv
from prometheus_client import Counter
from prometheus_client import Gauge
from prometheus_client import start_http_server


DEFAULT_METRICS_PORT = 8080
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s %(levelname)s %(name)s %(message)s'
)
logging.getLogger(__name__).addHandler(logging.NullHandler())


def read_yaml(path):
    """Loads config from a YAML file with env interpolation

    The file at ``path`` is read as text and handed to ``yamlenv.load``;
    whatever that call returns is returned unchanged.
    """
    with open(path, 'r') as yaml_file:
        contents = yaml_file.read()
    return yamlenv.load(contents)


def validate_monitor_settings(settings):
    """Validates that settings for a Monitor are valid

    Checks performed, in order:

    * ``name`` and ``command`` are present and truthy
    * ``check_interval``, ``alert_after`` and ``alert_every`` are ints
    * ``check_interval`` and ``alert_after`` are not 0

    Raises InvalidMonitorException on the first check that fails.

    Note: Cannot yet validate the Alerts exist from within this class.
    That will be done by Minitor later
    """
    name = settings.get('name')
    if not name:
        raise InvalidMonitorException('Invalid name for monitor')
    if not settings.get('command'):
        raise InvalidMonitorException(
            'Invalid command for monitor {}'.format(name)
        )

    type_assertions = (
        ('check_interval', int),
        ('alert_after', int),
        ('alert_every', int),
    )

    for key, val_type in type_assertions:
        val = settings.get(key)
        if not isinstance(val, val_type):
            raise InvalidMonitorException(
                'Invalid type on {}: {}. Expected {} and found {}'.format(
                    name, key, val_type.__name__, type(val).__name__
                )
            )

    # alert_every is deliberately absent here: Monitor.failure gives 0 its
    # own meaning for that setting
    non_zero = (
        'check_interval',
        'alert_after',
    )

    for key in non_zero:
        if settings.get(key) == 0:
            raise InvalidMonitorException(
                'Invalid value for {}: {}. Value cannot be 0'.format(
                    name, key
                )
            )


def maybe_decode(bstr, encoding='utf-8'):
    """Returns bstr decoded with encoding, or bstr itself if not decodable

    Values without a usable ``decode`` (``str`` and ``None`` on Python 3
    raise AttributeError) are passed through unchanged. Bytes that are not
    valid in ``encoding`` still raise UnicodeDecodeError.
    """
    try:
        return bstr.decode(encoding)
    except (AttributeError, TypeError):
        return bstr


def call_output(*popenargs, **kwargs):
    """Similar to check_output, but instead returns output and exception

    Returns a tuple of ``(output, ex)`` where ``ex`` is the
    CalledProcessError raised for a non-zero exit, or None on success.
    Trailing newlines are stripped from the output. Any other exception
    raised by check_output propagates to the caller.
    """
    # So we can capture complete output, redirect stderr to stdout
    kwargs.setdefault('stderr', subprocess.STDOUT)
    output, ex = None, None
    try:
        output = check_output(*popenargs, **kwargs)
    except CalledProcessError as e:
        output, ex = e.output, e

    # Output is bytes unless the caller asked for text mode through kwargs,
    # so strip with a newline of the matching type
    newline = b'\n' if isinstance(output, bytes) else '\n'
    output = output.rstrip(newline)
    return output, ex


class InvalidAlertException(Exception):
    """Raised when an Alert is configured without a command"""
    pass


class InvalidMonitorException(Exception):
    """Raised when a Monitor configuration fails validation"""
    pass


class MinitorAlert(Exception):
    """Signals that alerts should be issued for the attached monitor"""

    def __init__(self, message, monitor):
        super().__init__(message)
        self.monitor = monitor


class Monitor(object):
    """Primary configuration item for Minitor"""

    def __init__(self, config, counter=None, logger=None):
        """Accepts a dictionary of configuration items to override defaults

        ``counter`` is an optional labelled counter incremented once per
        executed check. ``logger`` is an optional parent logger; when given,
        a child logger named after this monitor is used.

        Raises InvalidMonitorException if the merged settings are invalid.
        """
        settings = {
            'alerts': ['log'],
            'check_interval': 30,
            'alert_after': 4,
            'alert_every': -1,
        }
        settings.update(config)
        validate_monitor_settings(settings)

        self.name = settings['name']
        self.command = settings['command']
        # alert_down falls back to the generic alerts list when unset/empty
        self.alert_down = settings.get('alert_down', [])
        if not self.alert_down:
            self.alert_down = settings.get('alerts', [])
        self.alert_up = settings.get('alert_up', [])
        self.check_interval = settings.get('check_interval')
        self.alert_after = settings.get('alert_after')
        self.alert_every = settings.get('alert_every')

        self.alert_count = 0
        self.last_check = None
        self.last_output = None
        self.last_success = None
        self.total_failure_count = 0

        self._counter = counter
        if logger is None:
            self._logger = logging.getLogger(
                '{}({})'.format(self.__class__.__name__, self.name)
            )
        else:
            self._logger = logger.getChild(
                '{}({})'.format(self.__class__.__name__, self.name)
            )

    def _count_check(self, is_success=True, is_alert=False):
        """Increments the check counter, if one was provided"""
        if self._counter is not None:
            self._counter.labels(
                monitor=self.name,
                status=('success' if is_success else 'failure'),
                is_alert=is_alert,
            ).inc()

    def should_check(self):
        """Determines if this Monitor should run its check command"""
        if not self.last_check:
            return True
        since_last_check = (datetime.now() - self.last_check).total_seconds()
        return since_last_check >= self.check_interval

    def check(self):
        """Returns None if skipped, False if failed, and True if successful

        Will raise an exception if should alert
        """
        if not self.should_check():
            return None

        # String commands are run through the shell; lists are exec'd as is
        output, ex = call_output(
            self.command,
            shell=isinstance(self.command, str),
        )
        output = maybe_decode(output)
        self._logger.debug(output)
        self.last_check = datetime.now()
        self.last_output = output

        is_success = None
        try:
            if ex is None:
                is_success = True
                self.success()
            else:
                is_success = False
                self.failure()
        except MinitorAlert:
            # Record the check before letting the alert propagate
            self._count_check(is_success=is_success, is_alert=True)
            raise

        self._count_check(is_success=is_success)

        return is_success

    def success(self):
        """Handles success tasks

        Resets the failure and alert counts and records the success time.
        Raises MinitorAlert if the monitor was not up before this call.
        """
        back_up = None
        if not self.is_up():
            back_up = MinitorAlert(
                '{} check is up again!'.format(self.name),
                self,
            )
        self.total_failure_count = 0
        self.alert_count = 0
        self.last_success = datetime.now()
        # Raise only after state is reset so handlers see the monitor as up
        if back_up:
            raise back_up

    def failure(self):
        """Handles failure tasks and possibly raises MinitorAlert"""
        self.total_failure_count += 1
        # Ensure we've hit the minimum number of failures to alert
        if self.total_failure_count < self.alert_after:
            return

        # Failures since the threshold was reached; 0 on the failure that
        # reaches it
        failure_count = (self.total_failure_count - self.alert_after)
        if self.alert_every > 0:
            # Otherwise, we should check against our alert_every
            should_alert = (failure_count % self.alert_every) == 0
        elif self.alert_every == 0:
            # Only alert on the first failure
            should_alert = failure_count == 0
        else:
            # Negative alert_every: back off exponentially, alerting when
            # failure_count reaches 0, 1, 3, 7, ...
            should_alert = (failure_count >= (2 ** self.alert_count) - 1)

        if should_alert:
            self.alert_count += 1
            raise MinitorAlert(
                '{} check has failed {} times'.format(
                    self.name,
                    self.total_failure_count
                ),
                self
            )

    def is_up(self):
        """Indicates if the monitor is up, i.e. has no failure alerts issued

        Returns True while alert_count is 0.
        """
        return self.alert_count == 0


class Alert(object):
    """A named command to run when a monitor needs to alert"""

    def __init__(self, name, config, counter=None, logger=None):
        """An alert must be named and have a config dict

        Raises InvalidAlertException if the config has no command.
        """
        self.name = name
        self.command = config.get('command')
        if not self.command:
            raise InvalidAlertException('Invalid alert {}'.format(self.name))

        self._counter = counter
        if logger is None:
            self._logger = logging.getLogger(
                '{}({})'.format(self.__class__.__name__, self.name)
            )
        else:
            self._logger = logger.getChild(
                '{}({})'.format(self.__class__.__name__, self.name)
            )

    def _count_alert(self, monitor):
        """Increments the alert counter"""
        if self._counter is not None:
            self._counter.labels(
                alert=self.name,
                monitor=monitor,
            ).inc()

    def _formated_command(self, **kwargs):
        """Formats command array or string with kwargs from Monitor"""
        # NOTE(review): a string command is later run with shell=True and
        # values such as last_output are interpolated here unescaped, so
        # monitor output can reach the shell. Prefer list commands.
        if isinstance(self.command, str):
            return self.command.format(**kwargs)
        return [arg.format(**kwargs) for arg in self.command]

    def _format_datetime(self, dt):
        """Formats a datetime for an alert"""
        if dt is None:
            return 'Never'
        return dt.isoformat()

    def alert(self, message, monitor):
        """Calls the alert command for the provided monitor

        The command output is logged at error level. Raises the
        CalledProcessError from the command if it exits non-zero.
        """
        self._count_alert(monitor.name)
        output, ex = call_output(
            self._formated_command(
                alert_count=monitor.alert_count,
                alert_message=message,
                failure_count=monitor.total_failure_count,
                last_output=monitor.last_output,
                last_success=self._format_datetime(monitor.last_success),
                monitor_name=monitor.name,
            ),
            shell=isinstance(self.command, str),
        )
        self._logger.error(maybe_decode(output))
        if ex is not None:
            raise ex


class Minitor(object):
    """Loads configuration and runs all monitors in a loop"""

    monitors = None
    alerts = None
    state = None
    check_interval = None

    def __init__(self):
        self._logger = logging.getLogger(self.__class__.__name__)
        # Metrics stay None unless _init_metrics is called
        self._alert_counter = None
        self._monitor_counter = None
        self._monitor_status_gauge = None

    def _parse_args(self, args=None):
        """Parses command line arguments and returns them"""
        parser = ArgumentParser(description='Minimal monitoring')
        parser.add_argument(
            '--config', '-c',
            dest='config_path',
            default='config.yml',
            help='Path to the config YAML file to use',
        )
        parser.add_argument(
            '--metrics', '-m',
            dest='metrics',
            action='store_true',
            help='Start webserver with metrics',
        )
        parser.add_argument(
            '--metrics-port', '-p',
            dest='metrics_port',
            type=int,
            default=DEFAULT_METRICS_PORT,
            help='Port to use when serving metrics',
        )
        parser.add_argument(
            '--verbose', '-v',
            action='count',
            # Adjacent literals form a single string; help must be a str
            # because argparse applies % formatting to it
            help=('Adjust log verbosity by increasing arg count. Default log '
                  'level is ERROR. Level increases with each `v`'),
        )
        return parser.parse_args(args)

    def _setup(self, config_path):
        """Load all setup from YAML file at provided path"""
        config = read_yaml(config_path)
        self.check_interval = config.get('check_interval', 30)
        self.monitors = [
            Monitor(
                mon,
                counter=self._monitor_counter,
                logger=self._logger,
            )
            for mon in config.get('monitors', [])
        ]
        # Add default alert for logging
        self.alerts = {
            'log': Alert(
                'log',
                {'command': ['echo', '{alert_message}!']},
                counter=self._alert_counter,
                logger=self._logger,
            )
        }
        # Configured alerts are added afterwards, so one named 'log'
        # replaces the default
        self.alerts.update({
            alert_name: Alert(
                alert_name,
                alert,
                counter=self._alert_counter,
                logger=self._logger,
            )
            for alert_name, alert in config.get('alerts', {}).items()
        })

    def _validate_monitors(self):
        """Validates monitors are valid against other config values

        Raises InvalidMonitorException if a monitor's check interval is
        below the global one or if it references an unknown alert.
        """
        for monitor in self.monitors:
            # Validate that the interval is valid
            if monitor.check_interval < self.check_interval:
                raise InvalidMonitorException(
                    'Monitor {} check interval is lower global value {}'.format(
                        monitor.name, self.check_interval
                    )
                )
            # Validate that the alerts for the monitor exist
            for alert in chain(monitor.alert_down, monitor.alert_up):
                if alert not in self.alerts:
                    raise InvalidMonitorException(
                        'Monitor {} contains an unknown alert: {}'.format(
                            monitor.name, alert
                        )
                    )

    def _init_metrics(self):
        """Creates the Prometheus counters and gauge used for metrics"""
        self._alert_counter = Counter(
            'minitor_alert_total',
            'Number of Minitor alerts',
            ['alert', 'monitor'],
        )
        self._monitor_counter = Counter(
            'minitor_check_total',
            'Number of Minitor checks',
            ['monitor', 'status', 'is_alert'],
        )
        self._monitor_status_gauge = Gauge(
            'minitor_monitor_up_count',
            'Currently responsive monitors',
            ['monitor'],
        )

    def _loop(self):
        """Runs all checks forever, sleeping check_interval between runs"""
        while True:
            self._check()
            sleep(self.check_interval)

    def _check(self):
        """Runs one pass over all monitors, issuing alerts as needed"""
        for monitor in self.monitors:
            try:
                result = monitor.check()
                # None means the check was skipped, so there is nothing to log
                if result is not None:
                    self._logger.info(
                        '%s: %s',
                        monitor.name,
                        'SUCCESS' if result else 'FAILURE'
                    )
            except MinitorAlert as minitor_alert:
                self._logger.warning(minitor_alert)
                self._handle_minitor_alert(minitor_alert)

            # Track the status of the Monitor
            if self._monitor_status_gauge is not None:
                self._monitor_status_gauge.labels(
                    monitor=monitor.name,
                ).set(int(monitor.is_up()))

    def _handle_minitor_alert(self, minitor_alert):
        """Issues all alerts for a provided monitor"""
        monitor = minitor_alert.monitor
        # The monitor state has already been updated by the time the alert
        # is raised, so is_up() tells recovery from failure
        alerts = monitor.alert_up if monitor.is_up() else monitor.alert_down
        for alert in alerts:
            self.alerts[alert].alert(str(minitor_alert), monitor)

    def _set_log_level(self, verbose):
        """Sets the log level for the class using the provided verbose count

        1 maps to WARNING, 2 to INFO and 3 or more to DEBUG.
        """
        if verbose == 1:
            self._logger.setLevel(logging.WARNING)
        elif verbose == 2:
            self._logger.setLevel(logging.INFO)
        elif verbose >= 3:
            self._logger.setLevel(logging.DEBUG)

    def run(self, args=None):
        """Runs Minitor in a loop"""
        args = self._parse_args(args)
        if args.verbose:
            self._set_log_level(args.verbose)

        # Metrics must exist before _setup so monitors and alerts get them
        if args.metrics:
            self._init_metrics()
            start_http_server(args.metrics_port)

        self._setup(args.config_path)
        self._validate_monitors()

        self._loop()


def main(args=None):
    """Runs Minitor until interrupted and returns the exit code 0"""
    try:
        Minitor().run(args)
    except KeyboardInterrupt:
        pass
    return 0


if __name__ == '__main__':
    sys.exit(main())