# Copyright (C) 2005-2024 Splunk Inc. All Rights Reserved.
import csv
import copy
import collections
from .custom_threshold_window import CustomThresholdWindow
from .chunked_util import die, add_message, read_chunk, write_chunk
from datetime import datetime, timedelta
from .kpi import KPIBase, ServiceKPI, TempKPI, FileBackedKPI, Service, EntityThreshold
import logging
import math
import pytz
from pytz.exceptions import UnknownTimeZoneError
import statistics
import sys
from io import StringIO
import hashlib
from ITOA.setup_logging import setup_logging
from itsi.itsi_time_block_utils import PolicyFilter
from SA_ITOA_app_common.solnlib.conf_manager import ConfManager

##################
# itsiatutils
##################
# Utility module for AT and outlier detection custom search commands.

# Windows will mangle our line-endings unless we do this.
if sys.platform == "win32":
    import os
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
    msvcrt.setmode(sys.stderr.fileno(), os.O_BINARY)
    msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY)

outlier_logger = setup_logging("itsi_apply_at_outliers.log",
                               "itsi.apply_at.outliers",
                               level=logging.DEBUG)

# Minimum number of data points required before thresholds are computed.
MIN_DATASET_LEN = 20

"""
Factor used to scale down number of KPIs processed per batch in the itsibatch CSC.
Limits the amount of KPI time series data needed to be passed to the applyat CSC by training window.
"""
AT_SCALE_DOWN_FACTORS = {
    '-7d': 1,
    '-14d': 2,
    '-30d': 4,
    '-60d': 8,
}


def log_and_warn(metadata, logger, msg, search_msg=None):
    """Log a warning and surface it as a search-level WARN message.

    @param metadata: chunked-protocol metadata dict the message is attached to
    @param logger: logger to write the warning to
    @param msg: message for the log file
    @param search_msg: optional user-facing message; defaults to msg
    """
    search_msg = search_msg or msg
    # fix: Logger.warn is a deprecated alias of Logger.warning
    logger.warning(msg)
    add_message(metadata, 'WARN', search_msg)


def log_and_die(metadata, logger, msg, search_msg=None):
    """Log an error and terminate the command via chunked_util.die."""
    logger.error(msg)
    die(metadata, msg, search_msg)


def generate_at_search(kpi_ids, log_level='INFO'):
    """
    Creates the search needed to run KPI level adaptive thresholding

    @type: list
    @param kpi_ids: the list of kpi_ids
    @type: string
    @param log_level: log_level for applyat command
    """
    if not isinstance(kpi_ids, list) or len(kpi_ids) < 1:
        return ''
    itsi_kpi_ids = 'itsi_kpi_id IN (' + ', '.join(kpi_ids) + ')'
    return '| mstats latest(alert_value) AS alert_value latest(alert_level) AS alert_level WHERE ' \
           '`get_itsi_summary_metrics_index` AND ( ' + itsi_kpi_ids + ' ) AND is_filled_gap_event!=1 ' \
           'AND is_null_alert_value=0 `metrics_service_level_kpi_only` by itsi_kpi_id, ' \
           'itsi_service_id span=1m | where alert_level!=-2 | table _time, alert_value, alert_level, ' \
           'itsi_kpi_id, itsi_service_id | applyat log_level=' + log_level


def generate_entity_at_search(entity_objects, log_level='INFO'):
    """
    Creates the search needed to run entity level adaptive thresholding

    @type: list
    @param entity_objects: the list of entity objects having entity_key, entity_title and kpi_id
    @type: string
    @param log_level: log_level for applyat command
    """
    if not isinstance(entity_objects, list) or len(entity_objects) < 1:
        return ''
    kpi_filter_string = " OR ".join(
        "(itsi_kpi_id=\"" + x['kpi_id'] + "\" AND entity_key=\"" + x['entity_key'] +
        "\" AND entity_title=\"" + x['entity_title'] + "\")"
        for x in entity_objects)
    return '| mstats latest(alert_value) AS alert_value latest(alert_level) AS alert_level WHERE ' \
           '`get_itsi_summary_metrics_index` AND ( ' + kpi_filter_string + ' ) AND is_filled_gap_event!=1 ' \
           'AND is_null_alert_value=0 AND `metrics_entity_level_kpi_only` by itsi_kpi_id, ' \
           'itsi_service_id, entity_key, entity_title span=1m | where alert_level!=-2 | table _time, alert_value, alert_level, ' \
           'itsi_kpi_id, itsi_service_id, entity_key, entity_title | applyat entitylevelthreshold log_level=' + log_level
def divide_into_batches(ids, batch_size=1):
    """Yield successive slices of at most batch_size ids.

    @type: list
    @param ids: the list of ids
    @param batch_size: number of ids per yielded slice
    """
    for start in range(0, len(ids), batch_size):
        yield ids[start:start + batch_size]


def generate_searches(batches):
    """Yield the adaptive-thresholding SPL search for each batch of KPI ids.

    @type: list
    @param batches: the groups of ids needing a search
    """
    for batch in batches:
        yield generate_at_search(batch)


def quantile(data, q):
    """Naive implementation of linear-interpolated quantile.

    Comparable to numpy.percentile()/pd.DataFrame.quantile().
    Expects data to be sorted ascending.

    Author: Jacob Leverich (jleverich@splunk.com)
    """
    assert q >= 0. and q <= 1.
    position = float(len(data) - 1) * q
    lower_idx = math.floor(position)
    upper_idx = math.ceil(position)
    if lower_idx == upper_idx:
        # position lands exactly on an element
        return data[int(lower_idx)]
    # linear interpolation between the two neighbouring elements
    fraction = (position - lower_idx) / (upper_idx - lower_idx)
    lower_val = data[int(lower_idx)]
    upper_val = data[int(upper_idx)]
    return lower_val + fraction * (upper_val - lower_val)


def quantiles(data, levels):
    """Return {level: quantile(data, level)} after dropping NaNs and sorting."""
    cleaned = sorted(x for x in data if not math.isnan(x))
    return {level: quantile(cleaned, float(level)) for level in levels}
def parse_input_data(the_dict, data, fields_list, params):
    """
    Populates the_dict with the values in data keyed by the fields in fields_list.

    @param the_dict: dict keyed by service_id and then by kpi_id into which we will write the data
    @param data: the incoming event data
    @param fields_list: list of strings containing the field names to be added as data to the
        appropriate list in the_dict
    @param params: Contains keys 'logger', 'use_kv_store', 'out_metadata', and 'kpi', the last of
        which contains 'service_id' and 'kpi_id'
    """
    use_kv_store = params['use_kv_store']
    logger = params['logger']
    reader = csv.DictReader(data.splitlines(), dialect='excel')
    for record in reader:
        # Backfill identifying fields with dummy values when they are absent.
        if 'itsi_service_id' not in record:
            if not use_kv_store:
                log_and_warn(metadata=params['out_metadata'], logger=logger,
                             msg="Missing Service ID: %s. Generating dummy value." % repr(record))
            record['itsi_service_id'] = 'DEFAULT_SERVICE_ID'
        if 'itsi_kpi_id' not in record:
            if not use_kv_store:
                log_and_warn(metadata=params['out_metadata'], logger=logger,
                             msg="Missing KPI ID: %s. Generating dummy value." % repr(record))
            record['itsi_kpi_id'] = 'DEFAULT_KPI_ID'
        if params['entity_level_thresholds'] and 'entity_title' not in record:
            if not use_kv_store:
                log_and_warn(metadata=params['out_metadata'], logger=logger,
                             msg="Missing Entity Title: %s. Generating dummy value." % repr(record))
            record['entity_title'] = 'DEFAULT_ENTITY_TITLE'

        # Any empty non-identifier field is fatal for this run.
        for f in fields_list:
            if record[f] == '' and f != 'itsi_service_id' and f != 'itsi_kpi_id' \
                    and f != 'entity_title' and f != 'entity_key':
                log_and_die(
                    metadata=params['out_metadata'], logger=logger,
                    msg="Missing field %s at time %s" % (str(f), str(record['_time'])))

        itsi_service_id = record['itsi_service_id']
        itsi_kpi_id = record['itsi_kpi_id']
        if itsi_service_id not in the_dict:
            the_dict[itsi_service_id] = dict()
        if itsi_kpi_id not in the_dict[itsi_service_id]:
            the_dict[itsi_service_id][itsi_kpi_id] = dict()
            if not params['entity_level_thresholds']:
                # service-level: one list per requested field
                the_dict[itsi_service_id][itsi_kpi_id] = {f: list() for f in fields_list}

        if params['entity_level_thresholds']:
            itsi_entity_key = record.get('entity_key', "N/A")
            if itsi_entity_key == "N/A":
                # Synthesize a stable pseudo key from title + kpi id.
                itsi_entity_key = hashlib.md5(
                    (record['entity_title'] + itsi_kpi_id).encode("utf-8")).hexdigest()
                params['pseudo_entities'].update({itsi_entity_key: record['entity_title']})
            if itsi_entity_key not in the_dict[itsi_service_id][itsi_kpi_id]:
                the_dict[itsi_service_id][itsi_kpi_id][itsi_entity_key] = {f: list() for f in fields_list}
            currentdict = the_dict[itsi_service_id][itsi_kpi_id][itsi_entity_key]
        else:
            currentdict = the_dict[itsi_service_id][itsi_kpi_id]

        for f in fields_list:
            currentdict[f].append(record[f])


def drop_dup(data, index):
    """Naive re-implementation of pd.DataFrame.drop_duplicates().

    Keeps a row only when its value in column `index` differs from the
    immediately preceding row's value.
    """
    out_data = {column: [] for column in data}
    previous = None
    for row, key_value in enumerate(data[index]):
        if key_value != previous:
            for column in data:
                out_data[column].append(data[column][row])
        previous = key_value
    return out_data
def clean_values(data, params):
    """Non-pandas replacement for atad_utils.create_dataframe().

    @param data: dict of
        '_time': list(epoch timestamp strings)
        'alert_value': list(float strings)
        'alert_period': list(float strings)  optional?
    @param params: dict with keys 'logger' and 'out_metadata'
    """
    logger = params['logger']
    metadata = params['out_metadata']
    # NOTE(review): dict(data) is a shallow copy, so the in-place float
    # conversions below also mutate the caller's lists — TODO confirm this is
    # relied upon downstream before changing it.
    values = dict(data)

    def _to_floats(column, message_fmt):
        # Coerce every entry to float in place; unparseable entries become NaN.
        for pos, raw in enumerate(column):
            try:
                column[pos] = float(raw)
            except ValueError:
                log_and_warn(metadata, logger, message_fmt % raw)
                column[pos] = float('nan')

    _to_floats(values['_time'], "Can't parse _time '%s' as float")
    # Drop duplicates
    values = drop_dup(values, '_time')
    _to_floats(values['alert_value'], "Can't parse alert_value '%s' as float")
    if 'alert_period' in values:
        _to_floats(values['alert_period'], "Can't parse alert_period '%s' as float")
    return values


def get_service_object(params):
    """Return a Service accessor when reading directly from the KV store, else None."""
    if params['use_kv_store'] and not params['use_temp_collection']:
        service_object = Service(logger=params['logger'])
        service_object.initialize_interface(params['session_key'], owner='nobody')
        return service_object
    return None


def get_kpi_object(params):
    """Return the appropriate KPI accessor for the current run mode.

    KV store + temp collection -> TempKPI; KV store -> ServiceKPI;
    otherwise a FileBackedKPI when a settings file is configured, else None.
    """
    kpi_object = None
    if params['use_kv_store']:
        if params['use_temp_collection'] and params['temp_collection'] is not None \
                and params['temp_key'] is not None:
            kpi_object = TempKPI(
                logger=params['logger'],
                temp_collection_name=params['temp_collection'],
                temp_object_key=params['temp_key'])
        else:
            kpi_object = ServiceKPI(
                logger=params['logger'],
                service_data=params['kpi']['service_data'],
                kpi_id=params['kpi']['kpi_id'])
        # NOTE(review): original indentation was lost; the KV interface init is
        # read here as common to both KPI flavours — confirm against history.
        kpi_object.initialize_interface(
            params['session_key'], owner='nobody', namespace='SA-ITOA')
        kpi_object.fetch_kpi()
        params['logger'].debug(
            "Initialized KV interface with session key %s" % params['session_key'])
    elif params['settings_file'] is not None:
        kpi_object = FileBackedKPI(
            logger=params['logger'], filename=params['settings_file'])
    return kpi_object
# Policy Class
class Policy(object):
    """A single adaptive-thresholding policy.

    Wraps a thresholding method ('stdev', 'quantile', 'range' or 'percentage')
    together with its configured threshold levels, and recomputes the levels'
    thresholdValue fields from a window of alert values.
    """

    def __init__(self, key, method, parameters, at_run_params, **kwargs):
        """Validate and store the policy definition.

        @param key: policy identifier string
        @param method: one of stdev, quantile, range, percentage
        @param parameters: list (max 10) of level records, each carrying 'dynamicParam'
        @param at_run_params: run context (service_id, training_window, ...)
        @param kwargs: optional 'title' and 'logger'
        """
        # validate methods and parameters
        if not isinstance(key, str):
            raise ValueError(
                "Null or non-string key sent to Policy constructor.")
        if not isinstance(method, str):
            raise ValueError(
                "Null or non-string method sent to Policy constructor. Must be a string: stdev, quantile, range, or percentage.")
        method_str = str(method)
        if method_str not in ['stdev', 'quantile', 'range', 'percentage']:
            raise ValueError(
                "Method must be one of stdev, quantile, range, or percentage.")
        if not parameters:
            # parameters is a list of theshold levels
            raise ValueError("Null parameters sent to Policy constructor.")
        if not isinstance(parameters, list) or len(parameters) > 10:
            raise ValueError(
                "Parameters must be a list of no more than 10 levels.", parameters)
        if not all('dynamicParam' in x for x in parameters):
            raise ValueError("Every level record must have a dynamicParam attribute")
        # store policies in form amenable to computing thresholds
        self.key = key
        self.method = method_str
        self.parameters = parameters
        self.title = kwargs.get('title', key)
        self.logger = kwargs.get('logger')
        self.at_run_params = at_run_params

    @property
    def parameter_values(self):
        """The dynamicParam of every level, as floats."""
        return [float(level['dynamicParam']) for level in self.parameters]

    def get_updated_levels(self, computed_thresholds, kpi_id, service_id):
        """
        Returns a copy of the levels structure stored in self.parameters where
        thresholdValue field is updated from the computed levels array
        """
        if len(computed_thresholds) != len(self.parameters):
            raise ValueError("Computed thresholds and stored thresholds structures are not of the same length")
        result = []
        for computed_value, level in zip(computed_thresholds, self.parameters):
            level_copy = copy.copy(level)
            level_copy['thresholdValue'] = computed_value
            result.append(level_copy)
        self.logger.debug("Calculated thresholdLevels for policy %s of kpi %s and service %s are %s",
                          self.key, kpi_id, service_id, result)
        return result

    # returns a copy of threshold levels structure with thresholdValue field updated
    def get_thresholds(self, values, kpi_dict):
        """Compute updated threshold levels from (alert_value, time, ...) tuples.

        Returns the updated levels list, or None when fewer than
        MIN_DATASET_LEN usable data points exist.
        """
        if self.method is None:
            raise UnboundLocalError("No method set for Policy.")
        data = {'alert_values': values}
        if len(values) < MIN_DATASET_LEN:
            self.logger.error(
                "There are less than %s data points to calculate thresholds in policy: %s, values: %s"
                % (MIN_DATASET_LEN, self.key, values))
            return None
        if kpi_dict['detect_outliers']:
            outlier_method = kpi_dict['outlier_detection_algo']
            outlier_multiplier = kpi_dict['outlier_multiplier']
            outliers, lower_bound, upper_bound = remove_outliers(
                data, outlier_method, outlier_multiplier)
            outlier_dict = {
                'kpi_id': kpi_dict['kpi_id'],
                'service_id': self.at_run_params['service_id'],
                'policy_key': self.key,
                'training_window': self.at_run_params['training_window'],
                'at_run_epoch': self.at_run_params['at_run_epoch'],
                'use_temp_collection': self.at_run_params['use_temp_collection'],
                'method': outlier_method,
                'multiplier': outlier_multiplier,
                'count': len(outliers),
                'lower_bound': lower_bound,
                'upper_bound': upper_bound
            }
            # Write outliers metadata to outlier log.
            outlier_logger.info(outlier_dict)
            self.logger.info("KPI: %s, %s outliers identified and removed: %s"
                             % (kpi_dict['kpi_id'], len(outliers), outliers))
        # First tuple slot is the alert value; drop NaNs before computing.
        samples = [v[0] for v in data['alert_values'] if not math.isnan(v[0])]
        if len(samples) < MIN_DATASET_LEN:
            self.logger.error("There are less than %s data points in policy: %s, %s"
                              % (MIN_DATASET_LEN, self.key, values))
            return None

        kpi_id = kpi_dict['kpi_id']
        service_id = self.at_run_params['service_id']
        if self.method == 'stdev':
            # pretty standard, really
            # Simple two-pass algorithm for calculating stdev. Reasonably numerically stable.
            mean = sum(samples) / len(samples)
            sqe = sum((x - mean) ** 2. for x in samples)
            std = math.sqrt(sqe / (len(samples) - 1))
            if std == 0.0:
                # Very rare scenario when all the alert values are the same,
                # setting it to a non-zero value based on a heuristic.
                self.logger.info("STD evaluated as 0, setting it to a non-zero value.")
                std = mean * 0.001 + 0.001  # 1000th of the mean
            return self.get_updated_levels(
                [mean + (std * c) for c in self.parameter_values], kpi_id, service_id)
        if self.method == 'quantile':
            # formerly iqr and same as "mass" in prior iterations
            by_level = quantiles(samples, self.parameter_values)
            return self.get_updated_levels(
                [by_level[k] for k in self.parameter_values], kpi_id, service_id)
        if self.method == 'range':
            # equal width bands
            dmin = min(samples)
            span = max(samples) - dmin
            return self.get_updated_levels(
                [dmin + (span * c) for c in self.parameter_values], kpi_id, service_id)
        if self.method == 'percentage':
            # Simple Percentage as a baseline algorithm: mean is the base of the percentage
            mean = sum(samples) / len(samples)
            return self.get_updated_levels(
                [mean * (1 + c / 100) for c in self.parameter_values], kpi_id, service_id)
        raise ValueError("Invalid thresholding method: " + self.method)
# Schedule Class
class Schedule(object):
    """Maps each data point to its time-block Policy and computes thresholds per policy.

    policies: dict of Policy Objects keyed by policy.key
    schedule: dict of policy_keys keyed by block_keys
    """

    def __init__(self, kpi_object, policies, threshold_spec, params):
        """
        @param kpi_object: a kpi.KPIBase instance the thresholds belong to
        @param policies: dict of Policy objects keyed by policy key (1..168 entries)
        @param threshold_spec: time-variate thresholds specification for PolicyFilter
        @param params: run parameters; 'logger' is picked up when present
        """
        # validate kpi
        if kpi_object is None:
            raise ValueError("Null KPI object sent to Schedule constructor.")
        if not isinstance(kpi_object, KPIBase):
            raise ValueError("KPI parameter must be a kpi.KPI object")
        # validate policies
        if policies is None:
            raise ValueError("Null policy dict sent to Schedule constructor.")
        if not isinstance(policies, dict):
            raise ValueError(
                "Policies parameter must be a dict, got %s." % type(policies))
        # fix: bound was `> 169`, off by one against the documented limit of
        # 168 (24 hourly blocks x 7 days) stated in the error message
        if len(policies) > 168 or len(policies) == 0:
            raise ValueError(
                "Policies parameter must be a dict of no more than 168 Policy objects, got %s." % len(policies))
        if sum([1 if not isinstance(p, Policy) else 0 for p in list(policies.values())]) > 0:
            raise ValueError("All policies must be Policy objects.")
        self.logger = None
        if 'logger' in params:
            self.logger = params['logger']
        self.kpi_object = kpi_object
        self.policies = policies
        self.filter = PolicyFilter(threshold_spec)

    def _get_thresholds(self, data, params):
        """Partition data points by policy time block and compute each policy's thresholds.

        @param data: dict with '_time' (float epochs) and 'alert_value' (floats)
        @param params: run parameters (DST adjustment flags, 'kpi', 'out_metadata')
        Returns dict of computed threshold-level lists keyed by policy key
        (None for policies with insufficient data).
        """
        if data is None:
            raise ValueError("Null data sent to Schedule.")
        if not isinstance(data, dict) or 'alert_value' not in data:
            raise ValueError(
                "Data passed to Schedule must be a dict with values in column 'alert_values'." + str(data))
        # divide data based on policy: D[policy_key] = [tuples]
        D = {}
        for policy_key in self.policies:
            D[policy_key] = []
        index_converted = data['_time']
        active_policies = set()
        for data_index in range(len(index_converted)):
            # If the apply_dst_to_at is enabled then shift add timestamp to the dst offset
            if not params["disable_dst_to_at"] and params["dst_change_timestamp"] > 0 \
                    and params["dst_offset"] != 0:
                # If _time is less than the last dst_change_timestamp then add it with the dst_offset
                if index_converted[data_index] < params["dst_change_timestamp"]:
                    index_converted[data_index] = index_converted[data_index] + params["dst_offset"]
            # provide a timestamp and TZ, get the policy that includes this timestamp
            policy_key = self.filter.get_policy_key(time=index_converted[data_index])
            if policy_key in D:
                D[policy_key].append(
                    (data['alert_value'][data_index], index_converted[data_index], False, 0, 0))
                active_policies.add(policy_key)
        # compute and accumulate the thresholds for each Policy
        T = {}
        insufficient_data_policies = []
        for policy_key in self.policies:
            the_data = D[policy_key]
            T[policy_key] = self.policies[policy_key].get_thresholds(the_data, params['kpi'])
            if T[policy_key] is None and policy_key in active_policies:
                insufficient_data_policies.append(self.policies[policy_key].title)
                self.logger.info(
                    "Insufficient data for threshold calculation: %d values." % len(D[policy_key]))
        if len(insufficient_data_policies) > 0:
            add_message(params['out_metadata'], 'WARN',
                        'insufficient data in ITSI summary index for policies %s'
                        % str(insufficient_data_policies))
        return T

    def get_thresholds(self, data, params):
        """Computes thresholds for a KPI and this schedule.

        :param data: dict with
            'alert_value': list of floats
            '_time': list of float epoch timestamps
        :param params: dict with kpi settings

        Returns a dict of lists of threshold level structures, keyed by
        policy.key; the structures should have a populated `thresholdValue`
        field obtained from the result of the computation
        """
        metadata = params['out_metadata']
        thresholds = {}
        kpi_info = 'kpiid="%s" on serviceid="%s"' % (
            str(params['kpi']['kpi_id']), str(params['kpi']['service_id']))
        try:
            thresholds = self._get_thresholds(data=data, params=params)
        except ValueError:
            log_and_warn(metadata=metadata, logger=self.logger,
                         msg='Unconvertible alert_values found for ' + kpi_info,
                         search_msg="unconvertible values found (check this KPI's `alert_value` "
                                    "field in ITSI summary index")
        except AssertionError:
            # Method should probably raise a ValueError/try to convert 0-100 to 0.0-1.0, but for now log nicely
            log_and_warn(metadata=metadata, logger=self.logger,
                         msg='Invalid quantile specified for %s, must be between 0.0 and 1.0' % kpi_info,
                         search_msg='invalid quantile value, must be between 0.0 and 1.0')
        except Exception as e:
            log_and_warn(metadata=metadata, logger=self.logger, msg=str(e))
            log_and_warn(metadata=metadata, logger=self.logger,
                         msg='Unexpected exception when computing thresholds for %s' % kpi_info)
        return thresholds
def create_schedule(params, entity_config=None):
    """Build a Schedule from the KPI's (or entity's) time-variate threshold settings.

    Skips static policies and policies with missing/invalid dynamicParam values,
    warning the user for each skipped one. Returns the Schedule, or None when no
    usable policy exists or Schedule construction fails.
    """
    policies = {}
    metadata = params['out_metadata']
    settings = entity_config['time_variate_thresholds_specification'] \
        if entity_config else params['kpi']['settings']
    logger = params['logger']

    # get policy settings for this KPI, create Policy objects
    for policy_key in settings['policies']:
        policy_spec = settings['policies'][policy_key]
        t_method = str(policy_spec['policy_type'])
        t_title = str(policy_spec.get('title', policy_key))
        try:
            t_levels = policy_spec[params['threshold_key']]['thresholdLevels']
        except KeyError as e:
            # we just skip this policy
            logger.exception(e)
            log_and_warn(metadata=metadata, logger=logger,
                         msg="Failed to retrieve %ss: %s" % (params['threshold_key'], e))
            continue
        policy_key = str(policy_key)

        if t_method == 'static':
            logger.info("Skipping static policy '%s'", policy_key)
            continue
        if not isinstance(t_levels, list) or not t_levels:
            log_and_warn(metadata=metadata, logger=logger,
                         msg="Unable to apply adaptive thresholding on policy '%s': please specify threshold values "
                             "for the policy" % t_title)
            continue

        # Every level must carry a float-convertible dynamicParam.
        skip_policy = False
        for level in t_levels:
            if 'dynamicParam' not in level:
                log_and_warn(metadata=metadata, logger=logger,
                             msg="Unable to apply adaptive thresholding on policy '%s': Missing threshold "
                                 "value." % t_title)
                skip_policy = True
                break
            try:
                float(level['dynamicParam'])
            except (TypeError, ValueError):
                log_and_warn(metadata=metadata, logger=logger,
                             msg="Unable to apply adaptive thresholding on policy '%s': Invalid threshold "
                                 "value: %s" % (t_title, level['dynamicParam']))
                skip_policy = True
                break
        if skip_policy:
            continue

        logger.debug("Loading settings for policy %s: method=%s levels=%s" % (
            policy_key, t_method, t_levels))
        try:
            at_run_params = {
                'at_run_epoch': params['at_run_epoch'],
                'use_temp_collection': params['use_temp_collection'],
                'service_id': params['kpi']['service_id'],
                'training_window': params['kpi']['adaptive_thresholding_training_window'],
            }
            policies[policy_key] = Policy(
                key=policy_key, method=t_method, parameters=t_levels,
                title=t_title, logger=logger, at_run_params=at_run_params)
        except ValueError as e:
            logger.exception(e)
            log_and_warn(metadata=metadata, logger=logger,
                         msg="Invalid arguments sent to Policy.")

    if len(policies) == 0:
        return
    the_schedule = None
    try:
        the_schedule = Schedule(
            kpi_object=params['kpi']['kpi_object'],
            policies=policies, threshold_spec=settings, params=params)
    except ValueError as e:
        logger.exception(e)
        log_and_warn(metadata=metadata, logger=logger,
                     msg="Invalid arguments sent to Schedule.")
    return the_schedule
def output_results(at_command, params, thresholds, data, entity_config=None):
    """Write computed thresholds (or raw policy-tagged data points) to the CSV writer.

    @param at_command: True when running as the applyat command (thresholds mode)
    @param params: run parameters ('writer', 'kpi', 'use_kv_store', 'out_metadata', ...)
    @param thresholds: dict of lists of threshold levels structures, keyed by policy id
    @param data: cleaned value dict ('alert_value', '_time')
    @param entity_config: entity-level configuration, when thresholding per entity
    """
    settings = entity_config['time_variate_thresholds_specification'] \
        if entity_config else params['kpi']['settings']
    service_id = params['kpi']['service_id']
    kpi_id = params['kpi']['kpi_id']

    if not thresholds and not at_command:
        # No thresholds computed: emit each data point tagged with its policy key.
        alerts_converted = data["alert_value"]
        time_converted = data["_time"]
        # fix: local previously named `filter`, shadowing the builtin
        policy_filter = PolicyFilter(settings)
        for index in range(len(time_converted)):
            try:
                alert_val = alerts_converted[index]
                time_val = time_converted[index]
                policy_key = policy_filter.get_policy_key(time_val)
                line = {
                    'policy_key': policy_key,
                    'itsi_service_id': service_id,
                    'itsi_kpi_id': kpi_id,
                    'alert_value': alert_val,
                    '_time': time_val
                }
                if entity_config:
                    line.update({'entity_key': entity_config['entity_key'],
                                 'entity_title': entity_config['entity_title']})
            except IndexError:
                raise Exception(data)
            params['writer'].writerow(line)
    else:
        for policy_id in thresholds:
            t = thresholds[policy_id]
            if t is None:
                continue
            if params['use_kv_store']:
                expected = len(settings['policies'][policy_id][params['threshold_key']]['thresholdLevels'])
                if len(t) != expected:
                    kpistr = ""
                    if service_id is not None and kpi_id is not None \
                            and service_id != "" and kpi_id != "":
                        kpistr = " for kpi %s" % str(service_id) + ":" + str(kpi_id)
                    msg = "Mismatched number of thresholdLevels: %s. Generated %d but found %d." % (
                        kpistr, len(t), expected)
                    log_and_warn(metadata=params['out_metadata'],
                                 logger=params['logger'], msg=msg)
                else:
                    # n.b. we assume thresholdLevels objects are
                    # sorted by increasing thresholdValue
                    # move this update_thresholds to outside
                    if entity_config:
                        params['entity_threshold_object'].update_thresholds(
                            policy=policy_id, thresholds=t, entity=entity_config)
                    else:
                        params['kpi']['kpi_object'].update_thresholds(
                            policy=policy_id, thresholds=t)
            line = {
                'policy_id': policy_id,
                'itsi_service_id': service_id,
                'itsi_kpi_id': kpi_id}
            if entity_config:
                line.update({'entity_key': entity_config['entity_key'],
                             'entity_title': entity_config['entity_title']})
            for thresh_index in range(len(t)):
                line['threshold_' + str(thresh_index)] = t[thresh_index].get('thresholdValue')
                line['threshold_metadata_' + str(thresh_index)] = t[thresh_index]
            params['writer'].writerow(line)
    return


def ignore_invalid_row(warn_message, logger):
    """
    Method to log warning and ignore read row result
    Assumes read_chunk was invoked before this method is invoked

    @type: basestring
    @param warn_message: warning message to log

    @rtype: None
    @return: None
    """
    # fix: Logger.warn is a deprecated alias of Logger.warning
    logger.warning(warn_message)
    # Dummy response to ignore
    write_chunk(sys.stdout, {"finished": False}, '')
def gather_input_data(params, logger, fields_list):
    """Read every chunk from stdin into params['kpidict'] via parse_input_data.

    kpidict is keyed by itsi_service_id then itsi_kpi_id. Each chunk is
    acknowledged with an empty non-final chunk on stdout.
    """
    kpidict = dict()  # kpidict['itsi_service_id']['itsi_kpi_id']
    while True:
        params['out_metadata']['finished'] = False
        ret = read_chunk(sys.stdin, logger)
        if not ret:
            break
        metadata, body = ret
        parse_input_data(
            the_dict=kpidict, data=body, fields_list=fields_list, params=params)
        write_chunk(sys.stdout, params['out_metadata'], '')
        if metadata.get('finished', False):
            break
    params['kpidict'] = kpidict
    params['outbuf'] = StringIO()


def last_dst_change_timestamp(logger, dst_timezone):
    """
    Returns the last dst change timestamp for the provided timestamp

    @type logger: logger object
    @param logger: logger object

    @type: string
    @param dst_timezone: timezone provided by the user in itsi_settings.conf

    Returns the epoch timestamp (rounded, one second before the most recent
    UTC-offset transition), or None when it cannot be determined.
    """
    try:
        # Creates a time zone object based on the provided timezone_name
        # fix: previously `tz` stayed unbound after UnknownTimeZoneError and
        # the following `if tz:` raised NameError (masked by the outer except);
        # bail out explicitly instead.
        try:
            tz = pytz.timezone(dst_timezone)
        except UnknownTimeZoneError as e:
            logger.exception(e)
            logger.error('Found Unknown timezone')
            return None
        # Gets the current time in the specified time zone.
        now = datetime.now(tz)
        # _utc_transition_times is pytz's list of naive UTC datetimes at which
        # the zone's offset from UTC changes (DST starts/ends, etc.), e.g.
        # [datetime(2024, 3, 10, 10, 0), datetime(2024, 11, 3, 9, 0), ...]
        transitions = list(tz._utc_transition_times)
        # Localize each transition to UTC so it can be compared with `now`.
        transitions = [pytz.utc.localize(transition) for transition in transitions]
        # Sorts the list of transition times in ascending order.
        transitions.sort()
        last_dst_change = None
        for transition_time in transitions:
            # Keep advancing while the transition is before the current time.
            if transition_time < now:
                last_dst_change = transition_time
            else:
                break
        if last_dst_change:
            # Converts the time object to a timestamp e.g. 1710064799
            return round((last_dst_change - timedelta(seconds=1)).timestamp(), 2)
        # fix: the error was previously logged after `return None` (unreachable)
        logger.error('Failed to fetch last dst change timestamp')
        return None
    except Exception as e:
        logger.exception(e)
        logger.error('Failed to fetch last dst change timestamp')
        return None


def _disable_apply_dst_to_at(conf, logger):
    """Best-effort: persist disabled=1 in the apply_dst_to_at stanza.

    Returns 1 when the update succeeded, 0 otherwise.
    """
    try:
        conf.update('apply_dst_to_at', {'disabled': 1})
        return 1
    except Exception as e:
        logger.exception(e)
        logger.error('Failed to update dst disabled settings for the threshold calculation')
        return 0


def get_at_dst_changes_details(logger, session_key):
    """
    Fetches the dst_changes details from apply_dst_to_at stanza from itsi_settings.conf
    and returns the required data to apply dst changes

    @type logger: logger object
    @param logger: logger object

    @type: string
    @param session_key: the splunkd session key for the request

    Returns (disable_dst_to_at, dst_timezone, dst_change_timestamp, dst_offset).
    """
    # Fetch data from apply_dst_to_at stanza of itsi_settings.conf
    cfm = ConfManager(session_key, 'SA-ITOA')
    conf = cfm.get_conf('itsi_settings')
    # fix: defaults so a failed conf read no longer leaves these names unbound
    # (the `if not disable_dst_to_at` below used to raise an uncaught NameError)
    disable_dst_to_at = 1
    dst_timezone = ''
    dst_offset = 0
    try:
        apply_dst_to_at = conf.get('apply_dst_to_at')
        disable_dst_to_at = int(apply_dst_to_at.get('disabled', 1))
        dst_timezone = apply_dst_to_at.get('timezone', '')
        dst_offset = int(apply_dst_to_at.get('offset', 0))
    except Exception as e:
        logger.exception(e)
        logger.error('Failed to fetch dst settings for the threshold calculation')
    days_since_dst = 0
    dst_change_timestamp = 0
    if not disable_dst_to_at:
        if dst_timezone and dst_offset != 0:
            # Fetch the last dst change timestamp as per the provided timezone
            dst_change_timestamp = last_dst_change_timestamp(logger, dst_timezone)
            if dst_change_timestamp:
                # Calculate the number of days past dst based on timestamp
                last_dst_change_datetime = datetime.utcfromtimestamp(dst_change_timestamp)
                current_datetime = datetime.utcnow()
                days_since_dst = (current_datetime - last_dst_change_datetime).days
                # Disable apply_dst_to_at in itsi_settings.conf if more than 60 days are past since dst changes
                if days_since_dst > 60:
                    disable_dst_to_at = _disable_apply_dst_to_at(conf, logger) or disable_dst_to_at
            else:
                disable_dst_to_at = _disable_apply_dst_to_at(conf, logger) or disable_dst_to_at
                logger.error('Could not find a timestamp for the provided timezone. Hence DST changes to AT will not be applied')
        else:
            disable_dst_to_at = _disable_apply_dst_to_at(conf, logger) or disable_dst_to_at
            logger.error('Could not find a timezone or an offset to apply DST changes to AT. Hence DST changes to AT will not be applied')
    return disable_dst_to_at, dst_timezone, dst_change_timestamp, dst_offset
def chunker(params, at_command=False):
    """Drive threshold computation for every (service, kpi) gathered from input.

    Sets up the CSV writer and service/entity accessors, then iterates the
    collected kpidict, calling calculate_thresholds per KPI (or per entity when
    entity-level thresholding is on) and finally writes the buffered output as
    one chunk to stdout.
    """
    logger = params['logger']
    kpidict = params['kpidict']
    params['outbuf'] = StringIO()

    # Field layout differs between applyat (thresholds) and data-tagging mode.
    if at_command:
        fields_list = ['policy_id']
        for k in range(10):
            fields_list.append("threshold_" + str(k))
            fields_list.append("threshold_metadata_" + str(k))
        fields_list = fields_list + ['itsi_service_id', 'itsi_kpi_id']
        if params['entity_level_thresholds']:
            fields_list = fields_list + ['entity_key', 'entity_title']
    else:
        fields_list = ['policy_key', 'itsi_service_id', 'itsi_kpi_id', 'alert_value', '_time']

    # prepare for generating output
    params['out_metadata']['finished'] = False
    # Create a dict writer with IO
    params['writer'] = csv.DictWriter(
        params['outbuf'], fieldnames=fields_list, dialect='excel', extrasaction='ignore')
    params['writer'].writeheader()

    # Get the service object
    params['service_object'] = get_service_object(params)
    (params['disable_dst_to_at'], params['dst_timezone'],
     params['dst_change_timestamp'], params['dst_offset']) = get_at_dst_changes_details(
        logger, session_key=params["session_key"])

    # Bulk fetch the services of targeted kpis
    if params['service_object']:
        params['service_object'].bulk_fetch_service(kpidict.keys())

    if params['entity_level_thresholds']:
        params['entity_threshold_object'] = EntityThreshold(logger=params['logger'])
        params['entity_threshold_object'].initialize_interface(
            params['session_key'], owner='nobody')

    if at_command:
        list_kpis = []
        for itsi_service_id in kpidict:
            for itsi_kpi_id in kpidict[itsi_service_id]:
                list_kpis.append(itsi_kpi_id)
        # Get the Active Custom Threshold Windows which are of type percentage
        ctw_object = CustomThresholdWindow(logger=logger)
        ctw_object.initialize_interface(params['session_key'], owner='nobody')
        ctw_linked_kpis = ctw_object.bulk_fetch_active_ctw(list_kpis)

    # Phase 2: iterate over (serviceid, kpiid) and output scores
    for itsi_service_id in kpidict:
        params['kpi'] = {
            'service_id': itsi_service_id,
            'service_data': None
        }
        if params['service_object']:
            # save the service data
            params['kpi']['service_data'] = params['service_object'].fetch_service(itsi_service_id)

        for itsi_kpi_id in kpidict[itsi_service_id]:
            params['kpi']['kpi_id'] = itsi_kpi_id
            # get the KPI object
            params['kpi']['kpi_object'] = get_kpi_object(params)
            if params['kpi']['kpi_object'] is None:
                ignore_invalid_row('No KPI found with id %s, ignoring ...' % itsi_kpi_id, logger)
                continue
            # get the settings
            kpi_tmp = params['kpi']['kpi_object'].get_kpi()
            if not isinstance(kpi_tmp, dict):
                ignore_invalid_row('No valid KPI found with id %s, ignoring ...' % itsi_kpi_id, logger)
                continue
            if 'time_variate_thresholds_specification' not in kpi_tmp:
                ignore_invalid_row(
                    'No valid thresholds specification found for KPI with id %s, ignoring ...'
                    % itsi_kpi_id, logger
                )
                continue

            params['kpi']['entity_thresholds'] = {}
            if params['entity_level_thresholds']:
                list_entity_keys = kpidict[itsi_service_id][itsi_kpi_id].keys()
                entity_threshold_configs = params['entity_threshold_object'].bulk_fetch_configs(
                    itsi_kpi_id, list_entity_keys, params['pseudo_entities'])
                if not entity_threshold_configs:
                    break
                for entity_config in entity_threshold_configs:
                    # Create temp Entity Key to store persistent entity config in a global object
                    entity_key = entity_config.get("entity_key") \
                        if entity_config.get("entity_key") != 'N/A' \
                        else hashlib.md5((entity_config['entity_title'] +
                                          entity_config['kpi_id']).encode("utf-8")).hexdigest()
                    params['kpi']['entity_thresholds'].update({entity_key: entity_config})

            params['kpi']['settings'] = kpi_tmp['time_variate_thresholds_specification']
            params['kpi']['detect_outliers'] = False
            params['kpi']['outlier_detection_algo'] = None
            params['kpi']['outlier_multiplier'] = None
            params['kpi']['adaptive_thresholding_training_window'] = \
                kpi_tmp['adaptive_thresholding_training_window']
            if 'aggregate_outlier_detection_enabled' in kpi_tmp:
                # Outlier removal only applies at service level.
                params['kpi']['detect_outliers'] = False if params['entity_level_thresholds'] \
                    else kpi_tmp['aggregate_outlier_detection_enabled']
                if kpi_tmp['aggregate_outlier_detection_enabled']:
                    if 'outlier_detection_algo' in kpi_tmp:
                        params['kpi']['outlier_detection_algo'] = kpi_tmp['outlier_detection_algo']
                    if 'outlier_detection_sensitivity' in kpi_tmp:
                        params['kpi']['outlier_multiplier'] = kpi_tmp['outlier_detection_sensitivity']

            if at_command:
                if kpi_tmp['_key'] in ctw_linked_kpis and kpi_tmp['adaptive_thresholds_is_enabled']:
                    kpi_tmp['recalculate_custom_thresholds'] = True

            if params['kpi']['settings'] is not None:
                if params['entity_level_thresholds']:
                    for entity_key in kpidict[itsi_service_id][itsi_kpi_id]:
                        # Ignore Entity from data if we don't have configuration available as kpi_entity_threshold
                        if not params['kpi']['entity_thresholds'].get(entity_key):
                            continue
                        calculate_thresholds(
                            at_command, params=params,
                            data=kpidict[itsi_service_id][itsi_kpi_id][entity_key],
                            entity_config=params['kpi']['entity_thresholds'][entity_key])
                else:
                    calculate_thresholds(
                        at_command, params=params,
                        data=kpidict[itsi_service_id][itsi_kpi_id],
                        entity_config=None)
            else:
                ignore_invalid_row(
                    'No valid thresholds specification found for KPI with id %s, ignoring ...'
                    % itsi_kpi_id, logger
                )
                continue

    # Write output data in buffer
    write_chunk(sys.stdout, params['out_metadata'], params['outbuf'].getvalue())
calculate_thresholds(at_command, params=params, data=kpidict[itsi_service_id][itsi_kpi_id][entity_key], entity_config=params['kpi']['entity_thresholds'][entity_key]) else: calculate_thresholds(at_command, params=params, data=kpidict[itsi_service_id][itsi_kpi_id], entity_config=None) else: ignore_invalid_row( 'No valid thresholds specification found for KPI with id %s, ignoring ...' % itsi_kpi_id, logger ) continue # Write output datain buffer write_chunk(sys.stdout, params['out_metadata'], params['outbuf'].getvalue()) def calculate_thresholds(at_command, params, data, entity_config=None): """ Calculate Thresholds from data dict @type: dict @param data: Data points dictionary to parse to get new threshold values @type: dict @param entity_config: Entity Level Configuration object. Only for Entity level thresholding """ if at_command: # create the schedule the_schedule = create_schedule(params=params, entity_config=entity_config) values = clean_values( data=data, params=params ) thresholds = None if at_command: # compute the thresholds if the_schedule is not None: thresholds = the_schedule.get_thresholds( data=values, params=params) else: thresholds = {} values['policy_key'] = [] # write output to buffer output_results(at_command=at_command, params=params, thresholds=thresholds, data=values, entity_config=entity_config) def remove_outliers(data, method, multiplier): if method is None: method = "stdev" if multiplier is None: multiplier = 2 return apply_outlier_algorithm(data, method, multiplier, remove=True) def apply_outlier_algorithm(data, method, multiplier, remove=False): if data is None or 'alert_values' not in data: raise Exception("Data is empty or not in correct format for applying outlier algorithm") list_of_tuples = data['alert_values'] arr_floats = [float(i[0]) for i in list_of_tuples] median = statistics.median(arr_floats) # Calculate bounds if method.lower() == 'mad': mad = statistics.median([abs(val - median) for val in arr_floats]) upper_bound = median + 
(float(multiplier) * mad) lower_bound = median - (float(multiplier) * mad) elif method.lower() == 'iqr': arr_floats = sorted(arr_floats) iqr = quantile(arr_floats, 0.75) - quantile(arr_floats, 0.25) upper_bound = median + (float(multiplier) * iqr) lower_bound = median - (float(multiplier) * iqr) elif method.lower() == 'stdev': mean = statistics.mean(arr_floats) stdev = statistics.stdev(arr_floats) upper_bound = mean + (float(multiplier) * stdev) lower_bound = mean - (float(multiplier) * stdev) else: raise Exception("Unsupported outlier detection method: %s" % method) updated_values = [] outliers = [] # Iterate over data and mark outliers for x in list_of_tuples: x_list = list(x) try: x_list[3] = lower_bound x_list[4] = upper_bound except IndexError: raise Exception(x_list) if float(x[0]) > upper_bound or float(x[0]) < lower_bound: x_list[2] = True outliers.append(tuple(x_list)) if remove: continue updated_values.append(tuple(x_list)) data['alert_values'] = updated_values return outliers, lower_bound, upper_bound def detect_outliers(params, policy_chunks): """ Updates policy_chunks dictionary with outliers detected based on the method chosen also returns outliers in a separate dictionary """ policy_outlier_map = collections.OrderedDict() logger = params['logger'] # Identify outliers per policy block for k, v in policy_chunks.items(): outliers = [] data = {'alert_values': v} outliers, _, _ = apply_outlier_algorithm(data, params['method'].lower(), params['multiplier']) logger.debug("%s outliers identified for method: %s, multiplier: %s, outliers: %s" % (len(outliers), params['method'], params['multiplier'], outliers)) policy_outlier_map[k] = outliers policy_chunks[k] = data['alert_values'] return policy_outlier_map