From e132f8660bed5f7f3487079d8505c534e41cdc48 Mon Sep 17 00:00:00 2001 From: Mitsuo Takaki Date: Fri, 25 Oct 2019 08:10:12 -0700 Subject: [PATCH] #72 - Adding the ability to control incident status --- README.md | 14 +++++ cachet_url_monitor/configuration.py | 97 +++++++++++++++++------------ cachet_url_monitor/status.py | 15 ++++- config.yml | 1 + setup.py | 2 +- tests/test_configuration.py | 4 +- 6 files changed, 88 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index 240a22a..a5dde08 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ endpoint: expectation: - type: HTTP_STATUS status_range: 200-300 + incident: MAJOR - type: LATENCY threshold: 1 - type: REGEX @@ -65,6 +66,19 @@ frequency: 30 - **latency_unit**, the latency unit used when reporting the metrics. It will automatically convert to the specified unit. It's not mandatory and it will default to **seconds**. Available units: `ms`, `s`, `m`, `h`. - **frequency**, how often we'll send a request to the given URL. The unit is in seconds. +Each `expectation` has its own default incident status. It can be overridden by setting the `incident` property to any of the following values: +- `PARTIAL` +- `MAJOR` +- `PERFORMANCE` + +By choosing any of the aforementioned statuses, you control the kind of incident a failed expectation should be considered. 
These are the default incident statuses for each `expectation` type: + +| Expectation | Incident status | +| ----------- | --------------- | +| HTTP_STATUS | PARTIAL | +| LATENCY | PERFORMANCE | +| REGEX | PARTIAL | + ## Setting up The application should be installed using **virtualenv**, through the following command: diff --git a/cachet_url_monitor/configuration.py b/cachet_url_monitor/configuration.py index 79aefba..51a9238 100644 --- a/cachet_url_monitor/configuration.py +++ b/cachet_url_monitor/configuration.py @@ -39,7 +39,7 @@ def __init__(self, component_id): self.component_id = component_id def __str__(self): - return repr('Component with id [%d] does not exist.' % (self.component_id,)) + return repr(f'Component with id [{self.component_id}] does not exist.') class MetricNonexistentError(Exception): @@ -49,7 +49,7 @@ def __init__(self, metric_id): self.metric_id = metric_id def __str__(self): - return repr('Metric with id [%d] does not exist.' % (self.metric_id,)) + return repr(f'Metric with id [{self.metric_id}] does not exist.') def get_current_status(endpoint_url, component_id, headers): @@ -57,7 +57,7 @@ def get_current_status(endpoint_url, component_id, headers): not exist or doesn't respond with the expected data. :return component status. """ - get_status_request = requests.get('%s/components/%s' % (endpoint_url, component_id), headers=headers) + get_status_request = requests.get(f'{endpoint_url}/components/{component_id}', headers=headers) if get_status_request.ok: # The component exists. 
@@ -69,7 +69,7 @@ def get_current_status(endpoint_url, component_id, headers): def normalize_url(url): """If passed url doesn't include schema return it with default one - http.""" if not url.lower().startswith('http'): - return 'http://%s' % url + return f'http://{url}' return url @@ -120,7 +120,7 @@ def __init__(self, config_file): os.environ.get('CACHET_PUBLIC_INCIDENTS') or self.data['cachet']['public_incidents']) self.logger.info('Monitoring URL: %s %s' % (self.endpoint_method, self.endpoint_url)) - self.expectations = [Expectaction.create(expectation) for expectation in self.data['endpoint']['expectation']] + self.expectations = [Expectation.create(expectation) for expectation in self.data['endpoint']['expectation']] for expectation in self.expectations: self.logger.info('Registered expectation: %s' % (expectation,)) @@ -157,16 +157,15 @@ def validate(self): configuration_errors.append('%s.%s' % (key, sub_key)) if ('endpoint' in self.data and 'expectation' in - self.data['endpoint']): + self.data['endpoint']): if (not isinstance(self.data['endpoint']['expectation'], list) or (isinstance(self.data['endpoint']['expectation'], list) and - len(self.data['endpoint']['expectation']) == 0)): + len(self.data['endpoint']['expectation']) == 0)): configuration_errors.append('endpoint.expectation') if len(configuration_errors) > 0: raise ConfigurationValidationError( - 'Config file [%s] failed validation. Missing keys: %s' % (self.config_file, - ', '.join(configuration_errors))) + f"Config file [{self.config_file}] failed validation. 
Missing keys: {', '.join(configuration_errors)}") def evaluate(self): """Sends the request to the URL set in the configuration and executes @@ -175,9 +174,10 @@ def evaluate(self): """ try: if self.endpoint_header is not None: - self.request = requests.request(self.endpoint_method, self.endpoint_url, timeout=self.endpoint_timeout, headers=self.endpoint_header) + self.request = requests.request(self.endpoint_method, self.endpoint_url, timeout=self.endpoint_timeout, + headers=self.endpoint_header) else: - self.request = requests.request(self.endpoint_method, self.endpoint_url, timeout=self.endpoint_timeout) + self.request = requests.request(self.endpoint_method, self.endpoint_url, timeout=self.endpoint_timeout) self.current_timestamp = int(time.time()) except requests.ConnectionError: self.message = 'The URL is unreachable: %s %s' % (self.endpoint_method, self.endpoint_url) @@ -208,7 +208,7 @@ def evaluate(self): self.logger.info(self.message) def print_out(self): - self.logger.info('Current configuration:\n%s' % (self.__repr__())) + self.logger.info(f'Current configuration:\n{self.__repr__()}') def __repr__(self): temporary_data = copy.deepcopy(self.data) @@ -224,7 +224,7 @@ def if_trigger_update(self): if self.status != 1: self.current_fails = self.current_fails + 1 - self.logger.info('Failure #%s with threshold set to %s' % (self.current_fails, self.allowed_fails)) + self.logger.warning(f'Failure #{self.current_fails} with threshold set to {self.allowed_fails}') if self.current_fails <= self.allowed_fails: self.trigger_update = False return @@ -276,8 +276,7 @@ def push_metrics(self): # Successful metrics upload self.logger.info('Metric uploaded: %.6f %s' % (value, self.latency_unit)) else: - self.logger.warning('Metric upload failed with status [%d]' % - (metrics_request.status_code,)) + self.logger.warning(f'Metric upload failed with status [{metrics_request.status_code}]') def push_incident(self): """If the component status has changed, we create a new incident 
(if this is the first time it becomes unstable) @@ -291,36 +290,33 @@ def push_incident(self): 'component_status': self.status, 'notify': True} - incident_request = requests.put('%s/incidents/%d' % (self.api_url, self.incident_id), params=params, + incident_request = requests.put(f'{self.api_url}/incidents/{self.incident_id}', params=params, headers=self.headers) if incident_request.ok: # Successful metrics upload self.logger.info( - 'Incident updated, API healthy again: component status [%d], message: "%s"' % ( - self.status, self.message)) + f'Incident updated, API healthy again: component status [{self.status}], message: "{self.message}"') del self.incident_id else: - self.logger.warning('Incident update failed with status [%d], message: "%s"' % ( - incident_request.status_code, self.message)) + self.logger.warning( + f'Incident update failed with status [{incident_request.status_code}], message: "{self.message}"') elif not hasattr(self, 'incident_id') and self.status != st.COMPONENT_STATUS_OPERATIONAL: # This is the first time the incident is being created. params = {'name': 'URL unavailable', 'message': self.message, 'status': 1, 'visible': self.public_incidents, 'component_id': self.component_id, 'component_status': self.status, 'notify': True} - incident_request = requests.post('%s/incidents' % (self.api_url,), params=params, headers=self.headers) + incident_request = requests.post(f'{self.api_url}/incidents', params=params, headers=self.headers) if incident_request.ok: # Successful incident upload. 
self.incident_id = incident_request.json()['data']['id'] self.logger.info( - 'Incident uploaded, API unhealthy: component status [%d], message: "%s"' % ( - self.status, self.message)) + f'Incident uploaded, API unhealthy: component status [{self.status}], message: "{self.message}"') else: self.logger.warning( - 'Incident upload failed with status [%d], message: "%s"' % ( - incident_request.status_code, self.message)) + f'Incident upload failed with status [{incident_request.status_code}], message: "{self.message}"') -class Expectaction(object): - """Base class for URL result expectations. Any new excpectation should extend +class Expectation(object): + """Base class for URL result expectations. Any new expectation should extend this class and the name added to create() method. """ @@ -329,6 +325,7 @@ def create(configuration): """Creates a list of expectations based on the configuration types list. """ + # If a new expectation is created, this is where we need to add it. expectations = { 'HTTP_STATUS': HttpStatus, 'LATENCY': Latency, @@ -336,6 +333,9 @@ def create(configuration): } return expectations.get(configuration['type'])(configuration) + def __init__(self, configuration): + self.incident_status = self.parse_incident_status(configuration) + @abc.abstractmethod def get_status(self, response): """Returns the status of the API, following cachet's component status @@ -346,43 +346,58 @@ def get_status(self, response): def get_message(self, response): """Gets the error message.""" + @abc.abstractmethod + def get_default_incident(self): + """Returns the default incident status used when no `incident` override is configured.""" + + def parse_incident_status(self, configuration): + return st.INCIDENT_MAP.get(configuration.get('incident', None), self.get_default_incident()) -class HttpStatus(Expectaction): + +class HttpStatus(Expectation): def __init__(self, configuration): self.status_range = HttpStatus.parse_range(configuration['status_range']) + super(HttpStatus, self).__init__(configuration) 
@staticmethod def parse_range(range_string): statuses = range_string.split("-") if len(statuses) == 1: # When there was no range given, we should treat the first number as a single status check. - return (int(statuses[0]), int(statuses[0]) + 1) + return int(statuses[0]), int(statuses[0]) + 1 else: # We shouldn't look into more than one value, as this is a range value. - return (int(statuses[0]), int(statuses[1])) + return int(statuses[0]), int(statuses[1]) def get_status(self, response): - if response.status_code >= self.status_range[0] and response.status_code < self.status_range[1]: + if self.status_range[0] <= response.status_code < self.status_range[1]: return st.COMPONENT_STATUS_OPERATIONAL else: - return st.COMPONENT_STATUS_PARTIAL_OUTAGE + return self.incident_status + + def get_default_incident(self): + return st.COMPONENT_STATUS_PARTIAL_OUTAGE def get_message(self, response): - return 'Unexpected HTTP status (%s)' % (response.status_code,) + return f'Unexpected HTTP status ({response.status_code})' def __str__(self): - return repr('HTTP status range: %s' % (self.status_range,)) + return repr(f'HTTP status range: {self.status_range}') -class Latency(Expectaction): +class Latency(Expectation): def __init__(self, configuration): self.threshold = configuration['threshold'] + super(Latency, self).__init__(configuration) def get_status(self, response): if response.elapsed.total_seconds() <= self.threshold: return st.COMPONENT_STATUS_OPERATIONAL else: - return st.COMPONENT_STATUS_PERFORMANCE_ISSUES + return self.incident_status + + def get_default_incident(self): + return st.COMPONENT_STATUS_PERFORMANCE_ISSUES def get_message(self, response): return 'Latency above threshold: %.4f seconds' % (response.elapsed.total_seconds(),) @@ -391,19 +406,23 @@ def __str__(self): return repr('Latency threshold: %.4f seconds' % (self.threshold,)) -class Regex(Expectaction): +class Regex(Expectation): def __init__(self, configuration): self.regex_string = configuration['regex'] 
self.regex = re.compile(configuration['regex'], re.UNICODE + re.DOTALL) + super(Regex, self).__init__(configuration) def get_status(self, response): if self.regex.match(response.text): return st.COMPONENT_STATUS_OPERATIONAL else: - return st.COMPONENT_STATUS_PARTIAL_OUTAGE + return self.incident_status + + def get_default_incident(self): + return st.COMPONENT_STATUS_PARTIAL_OUTAGE def get_message(self, response): return 'Regex did not match anything in the body' def __str__(self): - return repr('Regex: %s' % (self.regex_string,)) + return repr(f'Regex: {self.regex_string}') diff --git a/cachet_url_monitor/status.py b/cachet_url_monitor/status.py index 27a9da5..20d4061 100644 --- a/cachet_url_monitor/status.py +++ b/cachet_url_monitor/status.py @@ -4,12 +4,21 @@ These are all constants and are coupled to cachet's API configuration. """ - COMPONENT_STATUS_OPERATIONAL = 1 COMPONENT_STATUS_PERFORMANCE_ISSUES = 2 COMPONENT_STATUS_PARTIAL_OUTAGE = 3 COMPONENT_STATUS_MAJOR_OUTAGE = 4 COMPONENT_STATUSES = [COMPONENT_STATUS_OPERATIONAL, - COMPONENT_STATUS_PERFORMANCE_ISSUES, COMPONENT_STATUS_PARTIAL_OUTAGE, - COMPONENT_STATUS_MAJOR_OUTAGE] + COMPONENT_STATUS_PERFORMANCE_ISSUES, COMPONENT_STATUS_PARTIAL_OUTAGE, + COMPONENT_STATUS_MAJOR_OUTAGE] + +INCIDENT_PARTIAL = 'PARTIAL' +INCIDENT_MAJOR = 'MAJOR' +INCIDENT_PERFORMANCE = 'PERFORMANCE' + +INCIDENT_MAP = { + INCIDENT_PARTIAL: COMPONENT_STATUS_PARTIAL_OUTAGE, + INCIDENT_MAJOR: COMPONENT_STATUS_MAJOR_OUTAGE, + INCIDENT_PERFORMANCE: COMPONENT_STATUS_PERFORMANCE_ISSUES, +} diff --git a/config.yml b/config.yml index 241fb73..9f02260 100644 --- a/config.yml +++ b/config.yml @@ -7,6 +7,7 @@ endpoint: expectation: - type: HTTP_STATUS status_range: 200-300 + incident: MAJOR - type: LATENCY threshold: 1 - type: REGEX diff --git a/setup.py b/setup.py index a564090..7dc26ea 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup setup(name='cachet-url-monitor', - version='1.4', + version='1.5', 
description='Cachet URL monitor plugin', author='Mitsuo Takaki', author_email='mitsuotakaki@gmail.com', diff --git a/tests/test_configuration.py b/tests/test_configuration.py index e0c5eb6..9beda34 100644 --- a/tests/test_configuration.py +++ b/tests/test_configuration.py @@ -96,8 +96,8 @@ def request(method, url, headers, timeout=None): sys.modules['requests'].request = request self.configuration.evaluate() - self.assertEqual(self.configuration.status, cachet_url_monitor.status.COMPONENT_STATUS_PARTIAL_OUTAGE, - 'Component status set incorrectly') + self.assertEqual(self.configuration.status, cachet_url_monitor.status.COMPONENT_STATUS_MAJOR_OUTAGE, + 'Component status set incorrectly or custom incident status is incorrectly parsed') def test_evaluate_with_timeout(self): def request(method, url, headers, timeout=None):