You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1066 lines
46 KiB

# Copyright (C) 2005-2024 Splunk Inc. All Rights Reserved.
import csv
import copy
import collections
from .custom_threshold_window import CustomThresholdWindow
from .chunked_util import die, add_message, read_chunk, write_chunk
from datetime import datetime, timedelta
from .kpi import KPIBase, ServiceKPI, TempKPI, FileBackedKPI, Service, EntityThreshold
import logging
import math
import pytz
from pytz.exceptions import UnknownTimeZoneError
import statistics
import sys
from io import StringIO
import hashlib
from ITOA.setup_logging import setup_logging
from itsi.itsi_time_block_utils import PolicyFilter
from SA_ITOA_app_common.solnlib.conf_manager import ConfManager
##################
# itsiatutils
##################
# Utility module for AT and outlier detection custom search commands.
# Windows will mangle our line-endings unless we do this.
if sys.platform == "win32":
    import os
    import msvcrt
    # Put the standard streams into binary mode so the chunked protocol's
    # byte counts stay exact (no implicit '\n' -> '\r\n' translation).
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
    msvcrt.setmode(sys.stderr.fileno(), os.O_BINARY)
    msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY)
# Dedicated logger for outlier-removal metadata (see Policy.get_thresholds).
outlier_logger = setup_logging("itsi_apply_at_outliers.log", "itsi.apply_at.outliers", level=logging.DEBUG)
# Minimum number of data points a policy needs before thresholds are computed.
MIN_DATASET_LEN = 20
"""
Factor used to scale down number of KPIs processed per batch in the itsibatch CSC.
Limits the amount of KPI time series data needed to be passed to the applyat CSC by
training window.
"""
AT_SCALE_DOWN_FACTORS = {
    '-7d': 1,
    '-14d': 2,
    '-30d': 4,
    '-60d': 8,
}
def log_and_warn(metadata, logger, msg, search_msg=None):
    """Log *msg* at WARN level and attach a WARN message to the chunked
    protocol metadata (*search_msg* when given and non-empty, else *msg*)."""
    logger.warn(msg)
    add_message(metadata, 'WARN', search_msg or msg)
def log_and_die(metadata, logger, msg, search_msg=None):
    """Log *msg* at ERROR level, then terminate the search command via die().

    @param metadata: chunked-protocol metadata dict passed through to die()
    @param logger: logger instance
    @param msg: message written to the log and handed to die()
    @param search_msg: optional search-facing message forwarded to die()
    """
    logger.error(msg)
    die(metadata, msg, search_msg)
def generate_at_search(kpi_ids, log_level='INFO'):
    """
    Creates the search needed to run KPI level adaptive thresholding.

    @type kpi_ids: list
    @param kpi_ids: the list of kpi_ids
    @type log_level: string
    @param log_level: log_level for the applyat command
    @return: the SPL string, or '' when kpi_ids is not a non-empty list
    """
    if not isinstance(kpi_ids, list) or len(kpi_ids) < 1:
        return ''
    kpi_id_clause = 'itsi_kpi_id IN (' + ', '.join(kpi_ids) + ')'
    # Assemble the SPL pipeline piecewise; output is byte-identical to the
    # original concatenation form.
    search_parts = [
        '| mstats latest(alert_value) AS alert_value latest(alert_level) AS alert_level WHERE ',
        '`get_itsi_summary_metrics_index` AND ( ', kpi_id_clause, ' ) AND is_filled_gap_event!=1 ',
        'AND is_null_alert_value=0 `metrics_service_level_kpi_only` by itsi_kpi_id, ',
        'itsi_service_id span=1m | where alert_level!=-2 | table _time, alert_value, alert_level, ',
        'itsi_kpi_id, itsi_service_id | applyat log_level=', log_level,
    ]
    return ''.join(search_parts)
def generate_entity_at_search(entity_objects, log_level='INFO'):
    """
    Creates the search needed to run entity level adaptive thresholding.

    @type entity_objects: list
    @param entity_objects: the list of entity objects having entity_key, entity_title and kpi_id
    @type log_level: string
    @param log_level: log_level for the applyat command
    @return: the SPL string, or '' when entity_objects is not a non-empty list
    """
    if not isinstance(entity_objects, list) or len(entity_objects) < 1:
        return ''
    # One OR'd clause per entity, pinning kpi, entity key and entity title.
    clauses = []
    for entity in entity_objects:
        clauses.append(
            '(itsi_kpi_id="%s" AND entity_key="%s" AND entity_title="%s")'
            % (entity['kpi_id'], entity['entity_key'], entity['entity_title']))
    kpi_filter_string = " OR ".join(clauses)
    return ('| mstats latest(alert_value) AS alert_value latest(alert_level) AS alert_level WHERE '
            '`get_itsi_summary_metrics_index` AND ( ' + kpi_filter_string + ' ) AND is_filled_gap_event!=1 '
            'AND is_null_alert_value=0 AND `metrics_entity_level_kpi_only` by itsi_kpi_id, '
            'itsi_service_id, entity_key, entity_title span=1m | where alert_level!=-2 | table _time, alert_value, alert_level, '
            'itsi_kpi_id, itsi_service_id, entity_key, entity_title | applyat entitylevelthreshold log_level=' + log_level)
def divide_into_batches(ids, batch_size=1):
    """
    Divides the ids in groups by batch size (yields successive slices,
    each at most batch_size long; the last slice may be shorter).

    @type ids: list
    @param ids: the list of ids
    @param batch_size: maximum number of ids per yielded batch
    """
    start = 0
    total = len(ids)
    while start < total:
        yield ids[start:start + batch_size]
        start += batch_size
def generate_searches(batches):
    """
    Generate the SPL needed to run adaptive thresholding searches,
    one search string per batch of KPI ids.

    @type batches: iterable
    @param batches: the groups of ids to build searches for
    """
    for id_group in batches:
        yield generate_at_search(id_group)
def quantile(data, q):
    """Naive implementation of linear-interpolated quantile.
    Comparable to numpy.percentile()/pd.DataFrame.quantile().
    Author: Jacob Leverich (jleverich@splunk.com)

    *data* must already be sorted ascending; *q* is a fraction in [0.0, 1.0].
    """
    assert 0. <= q <= 1.
    # Fractional position of the requested quantile within the sorted data.
    position = float(len(data) - 1) * q
    lower = math.floor(position)
    upper = math.ceil(position)
    if lower == upper:
        # Landed exactly on an element; no interpolation needed.
        return data[int(lower)]
    # Linearly interpolate between the two neighbouring elements.
    weight = (position - lower) / (upper - lower)
    lo_val = data[int(lower)]
    hi_val = data[int(upper)]
    return lo_val + weight * (hi_val - lo_val)
def quantiles(data, levels):
    """Return {level: interpolated quantile of *data*} for each requested level.

    NaN entries are discarded before sorting so they cannot poison the order.
    """
    # Remove nan's if any in the data, then sort for quantile()
    usable = sorted(v for v in data if not math.isnan(v))
    return {level: quantile(usable, float(level)) for level in levels}
def parse_input_data(the_dict, data, fields_list, params):
    """
    Populates the_dict with the values in data keyed by the fields in fields_list.
    @param the_dict: dict keyed by service_id and then by kpi_id into which we will write the data
    @param data: the incoming event data (CSV payload from a chunked-protocol body)
    @param fields_list: list of strings containing the field names to be added as data to the appropriate list in the_dict
    @param params: Contains keys 'logger', 'use_kv_store', 'out_metadata', and 'kpi', the last of which contains 'service_id' and 'kpi_id'
    """
    use_kv_store = params['use_kv_store']
    logger = params['logger']
    reader = csv.DictReader(data.splitlines(), dialect='excel')
    for record in reader:
        # Backfill missing identifier columns with sentinel values; the warning
        # is only emitted when not running against the KV store.
        if 'itsi_service_id' not in record:
            if not use_kv_store:
                log_and_warn(metadata=params[
                    'out_metadata'], logger=logger, msg="Missing Service ID: %s. Generating dummy value." % repr(record))
            record['itsi_service_id'] = 'DEFAULT_SERVICE_ID'
        if 'itsi_kpi_id' not in record:
            if not use_kv_store:
                log_and_warn(metadata=params[
                    'out_metadata'], logger=logger, msg="Missing KPI ID: %s. Generating dummy value." % repr(record))
            record['itsi_kpi_id'] = 'DEFAULT_KPI_ID'
        if params['entity_level_thresholds'] and 'entity_title' not in record:
            if not use_kv_store:
                log_and_warn(metadata=params[
                    'out_metadata'], logger=logger, msg="Missing Entity Title: %s. Generating dummy value." % repr(record))
            record['entity_title'] = 'DEFAULT_ENTITY_TITLE'
        # Every other required column must be non-empty; abort the command otherwise.
        for f in fields_list:
            if record[f] == '' and f != 'itsi_service_id' and f != 'itsi_kpi_id' and f != 'entity_title' and f != 'entity_key':
                log_and_die(
                    metadata=params['out_metadata'], logger=logger, msg="Missing field %s at time %s" % (str(f), str(record['_time'])))
        itsi_service_id = record['itsi_service_id']
        itsi_kpi_id = record['itsi_kpi_id']
        if itsi_service_id not in the_dict:
            the_dict[itsi_service_id] = dict()
        if itsi_kpi_id not in the_dict[itsi_service_id]:
            the_dict[itsi_service_id][itsi_kpi_id] = dict()
            # First sighting of this KPI: for KPI-level runs the value is a plain
            # column dict (field -> list) rather than a per-entity dict.
            if not params['entity_level_thresholds']:
                tmpdict = {}
                for f in fields_list:
                    tmpdict[f] = list()
                the_dict[record['itsi_service_id']][record['itsi_kpi_id']] = tmpdict
        if params['entity_level_thresholds']:
            itsi_entity_key = record.get('entity_key', "N/A")
            if itsi_entity_key == "N/A":
                # Pseudo-entity (no real key): derive a stable key from title+kpi
                # and remember the mapping for later output.
                itsi_entity_key = hashlib.md5((record['entity_title'] + itsi_kpi_id).encode("utf-8")).hexdigest()
                params['pseudo_entities'].update({itsi_entity_key: record['entity_title']})
            if itsi_entity_key not in the_dict[itsi_service_id][itsi_kpi_id]:
                tmpdict = {}
                for f in fields_list:
                    tmpdict[f] = list()
                the_dict[record['itsi_service_id']][record['itsi_kpi_id']][itsi_entity_key] = tmpdict
            currentdict = the_dict[itsi_service_id][itsi_kpi_id][itsi_entity_key]
        else:
            currentdict = the_dict[itsi_service_id][itsi_kpi_id]
        # Append this row's values column-wise.
        for f in fields_list:
            currentdict[f].append(record[f])
def drop_dup(data, index):
    """Naive re-implementation of pd.DataFrame.drop_duplicates().

    Keeps a row only when its *index*-column value differs from the
    immediately preceding row's value (collapses consecutive duplicates).
    """
    result = {key: [] for key in data}
    previous = None
    for pos, current in enumerate(data[index]):
        if current == previous:
            continue
        for key in data:
            result[key].append(data[key][pos])
        previous = current
    return result
def clean_values(data, params):
    """Non-pandas replacement for atad_utils.create_dataframe().
    @param data: dict of '_time': list(epoch timestamp strings)
                 'alert_value': list(float strings)
                 'alert_period': list(float strings) optional?
    @param params: dict with keys 'logger' and 'out_metadata'
    """
    logger = params['logger']
    metadata = params['out_metadata']
    values = dict(data)

    def _coerce_column(field):
        # Convert values[field] entries to float in place; entries that fail
        # to parse become NaN (with a warning) instead of aborting the run.
        column = values[field]
        for pos, raw in enumerate(column):
            try:
                column[pos] = float(raw)
            except ValueError:
                log_and_warn(metadata, logger, "Can't parse %s '%s' as float" % (field, raw))
                column[pos] = float('nan')

    _coerce_column('_time')
    # Drop duplicates (consecutive rows sharing the same timestamp)
    values = drop_dup(values, '_time')
    _coerce_column('alert_value')
    if 'alert_period' in values:
        _coerce_column('alert_period')
    return values
def get_service_object(params):
    """Build a KV-store-backed Service accessor, or None when not applicable.

    A Service object is only created for KV-store runs that do not go through
    a temp collection; every other configuration returns None.
    """
    if not params['use_kv_store'] or params['use_temp_collection']:
        return None
    service_object = Service(logger=params['logger'])
    service_object.initialize_interface(
        params['session_key'], owner='nobody')
    return service_object
def get_kpi_object(params):
    """Build the KPI accessor appropriate for the run configuration.

    KV-store runs get a TempKPI (when a temp collection and key are set) or a
    ServiceKPI, both initialized and pre-fetched; otherwise a FileBackedKPI is
    built from params['settings_file']. Returns None when neither applies.
    """
    if params['use_kv_store']:
        if params['use_temp_collection'] and params['temp_collection'] is not None and params['temp_key'] is not None:
            kpi_object = TempKPI(logger=params['logger'], temp_collection_name=params['temp_collection'], temp_object_key=params['temp_key'])
        else:
            kpi_object = ServiceKPI(
                logger=params['logger'], service_data=params['kpi']['service_data'], kpi_id=params['kpi']['kpi_id'])
        kpi_object.initialize_interface(
            params['session_key'], owner='nobody', namespace='SA-ITOA')
        kpi_object.fetch_kpi()
        params['logger'].debug(
            "Initialized KV interface with session key %s" % params['session_key'])
        return kpi_object
    if params['settings_file'] is not None:
        return FileBackedKPI(
            logger=params['logger'], filename=params['settings_file'])
    return None
# Policy Class
class Policy(object):
    """One thresholding policy (time block) for a KPI: a thresholding method
    plus its configured levels, from which concrete threshold values are
    computed over a window of alert values."""

    def __init__(self, key, method, parameters, at_run_params, **kwargs):
        """
        @param key: policy key (string)
        @param method: one of 'stdev', 'quantile', 'range', 'percentage'
        @param parameters: list (max 10) of level dicts, each with a 'dynamicParam'
        @param at_run_params: dict with 'service_id', 'training_window',
                              'at_run_epoch', 'use_temp_collection'
        @keyword title: display title (defaults to key)
        @keyword logger: logger instance
        @raise ValueError: on any invalid argument
        """
        # validate methods and parameters
        if not isinstance(key, str):
            raise ValueError(
                "Null or non-string key sent to Policy constructor.")
        if not isinstance(method, str):
            raise ValueError(
                "Null or non-string method sent to Policy constructor. Must be a string: stdev, quantile, range, or percentage.")
        method_str = str(method)
        if method_str not in ['stdev', 'quantile', 'range', 'percentage']:
            raise ValueError(
                "Method must be one of stdev, quantile, range, or percentage.")
        if not parameters:  # parameters is a list of theshold levels
            raise ValueError("Null parameters sent to Policy constructor.")
        if not isinstance(parameters, list) or len(parameters) > 10:
            raise ValueError(
                "Parameters must be a list of no more than 10 levels.", parameters)
        if not all('dynamicParam' in x for x in parameters):
            raise ValueError("Every level record must have a dynamicParam attribute")
        # store policies in form amenable to computing thresholds
        self.key = key
        self.method = method_str
        self.parameters = parameters
        self.title = kwargs.get('title', key)
        self.logger = kwargs.get('logger')
        self.at_run_params = at_run_params

    @property
    def parameter_values(self):
        # property that extracts dynamic param values from parameter list
        return [float(x['dynamicParam']) for x in self.parameters]

    def get_updated_levels(self, computed_thresholds, kpi_id, service_id):
        """
        Returns a copy of the levels structure stored in self.parameters
        where thresholdValue field is updated from the computed levels array
        """
        if len(computed_thresholds) != len(self.parameters):
            raise ValueError("Computed thresholds and stored thresholds structures are not of the same length")
        result = []
        for computed_value, level in zip(computed_thresholds, self.parameters):
            # Shallow copy is sufficient: only the top-level thresholdValue changes.
            level_copy = copy.copy(level)
            level_copy['thresholdValue'] = computed_value
            result.append(level_copy)
        self.logger.debug("Calculated thresholdLevels for policy %s of kpi %s and service %s are %s", self.key, kpi_id, service_id, result)
        return result

    # returns a copy of threshold levels structure with thresholdValue field updated
    def get_thresholds(self, values, kpi_dict):
        """Compute threshold values from *values* using this policy's method.

        @param values: list of tuples whose first element is the alert value
                       (see Schedule._get_thresholds for the tuple layout)
        @param kpi_dict: KPI settings; reads 'kpi_id', 'detect_outliers',
                         'outlier_detection_algo', 'outlier_multiplier'
        @return: updated threshold-levels list, or None when fewer than
                 MIN_DATASET_LEN usable data points are available
        """
        if self.method is None:
            raise UnboundLocalError("No method set for Policy.")
        data = {'alert_values': values}
        if len(values) < MIN_DATASET_LEN:
            self.logger.error("There are less than %s data points to calculate thresholds in policy: %s, values: %s" % (MIN_DATASET_LEN, self.key, values))
            return None
        if kpi_dict['detect_outliers']:
            outlier_method = kpi_dict['outlier_detection_algo']
            outlier_multiplier = kpi_dict['outlier_multiplier']
            # NOTE(review): remove_outliers is defined elsewhere in this module;
            # it appears to filter data['alert_values'] in place (the filtered
            # list is read below) -- confirm against its definition.
            outliers, lower_bound, upper_bound = remove_outliers(data, outlier_method, outlier_multiplier)
            outlier_dict = {
                'kpi_id': kpi_dict['kpi_id'],
                'service_id': self.at_run_params['service_id'],
                'policy_key': self.key,
                'training_window': self.at_run_params['training_window'],
                'at_run_epoch': self.at_run_params['at_run_epoch'],
                'use_temp_collection': self.at_run_params['use_temp_collection'],
                'method': outlier_method,
                'multiplier': outlier_multiplier,
                'count': len(outliers),
                'lower_bound': lower_bound,
                'upper_bound': upper_bound
            }
            # Write outliers metadata to outlier log.
            outlier_logger.info(outlier_dict)
            self.logger.info("KPI: %s, %s outliers identified and removed: %s" % (kpi_dict['kpi_id'], len(outliers), outliers))
        filtered_values = data['alert_values']
        # Each entry is a tuple; index 0 holds the alert value. Drop NaNs.
        D = {'alert_values': [v[0] for v in filtered_values if not math.isnan(v[0])]}
        if len(D['alert_values']) < MIN_DATASET_LEN:
            self.logger.error("There are less than %s data points in policy: %s, %s" % (MIN_DATASET_LEN, self.key, values))
            return None
        if self.method == 'stdev':  # pretty standard, really
            # Simple two-pass algorithm for calculating stdev. Reasonably numerically stable.
            mean = sum(D['alert_values']) / len(D['alert_values'])
            sqe = sum((x - mean) ** 2. for x in D['alert_values'])
            std = math.sqrt(sqe / (len(D['alert_values']) - 1))
            if std == 0.0:
                # Very rare scenario when all the alert values are the same,
                # setting it to a non-zero value based on a heuristic.
                self.logger.info("STD evaluated as 0, setting it to a non-zero value.")
                std = mean * 0.001 + 0.001  # 1000th of the mean
            return self.get_updated_levels([mean + (std * c) for c in self.parameter_values], kpi_dict['kpi_id'], self.at_run_params['service_id'])
        # formerly iqr and same as "mass" in prior iterations
        elif self.method == 'quantile':
            T = quantiles(D['alert_values'], self.parameter_values)
            return self.get_updated_levels([T[k] for k in self.parameter_values], kpi_dict['kpi_id'], self.at_run_params['service_id'])
        elif self.method == 'range':  # equal width bands
            dmax = max(D['alert_values'])
            dmin = min(D['alert_values'])
            span = dmax - dmin
            return self.get_updated_levels([dmin + (span * c) for c in self.parameter_values], kpi_dict['kpi_id'], self.at_run_params['service_id'])
        elif self.method == 'percentage':
            # Simple Percentage as a baseline algorithm, calculate mean and use it as a base of percentage
            mean = sum(D['alert_values']) / len(D['alert_values'])
            return self.get_updated_levels([mean * (1 + c / 100) for c in self.parameter_values], kpi_dict['kpi_id'], self.at_run_params['service_id'])
        else:
            raise ValueError("Invalid thresholding method: " + self.method)
# Schedule Class
class Schedule(object):
    # policies: dict of Policy Objects keyed by policy.key
    # schedule: dict of policy_keys keyed by block_keys
    def __init__(self, kpi_object, policies, threshold_spec, params):
        """Bind a KPI to its time-block policies.

        @param kpi_object: a kpi.KPIBase instance
        @param policies: dict of Policy objects keyed by policy key (1..168 entries)
        @param threshold_spec: time-variate threshold spec used to build the PolicyFilter
        @param params: run parameters; only 'logger' is read here
        @raise ValueError: on any invalid argument
        """
        # validate kpi
        if kpi_object is None:
            raise ValueError("Null KPI object sent to Schedule constructor.")
        if not isinstance(kpi_object, KPIBase):
            raise ValueError("KPI parameter must be a kpi.KPI object")
        # validate policies
        if policies is None:
            raise ValueError("Null policy dict sent to Schedule constructor.")
        if not isinstance(policies, dict):
            raise ValueError(
                "Policies parameter must be a dict, got %s." % type(policies))
        if len(policies) > 169 or len(policies) == 0:
            raise ValueError(
                "Policies parameter must be a dict of no more than 168 Policy objects, got %s." % len(policies))
        if sum([1 if not isinstance(p, Policy) else 0 for p in list(policies.values())]) > 0:
            raise ValueError("All policies must be Policy objects.")
        self.logger = None
        if 'logger' in params:
            self.logger = params['logger']
        self.kpi_object = kpi_object
        self.policies = policies
        self.filter = PolicyFilter(threshold_spec)

    def _get_thresholds(self, data, params):
        """Bucket data points by policy time block, then compute thresholds per policy.

        Returns dict of threshold-levels lists (or None for insufficient data)
        keyed by policy key.
        """
        if data is None:
            raise ValueError("Null data sent to Schedule.")
        if not isinstance(data, dict) or 'alert_value' not in data:
            raise ValueError(
                "Data passed to Schedule must be a dict with values in column 'alert_values'." + str(data))
        # divide data based on policy: D[policy_key] = [tuples]
        D = {}
        for policy_key in self.policies:
            D[policy_key] = []
        # NOTE: index_converted aliases data['_time']; the DST shift below
        # mutates the caller's list in place.
        index_converted = data['_time']
        active_policies = set()
        for data_index in range(len(index_converted)):
            # If the apply_dst_to_at is enabled then shift add timestamp to the dst offset
            if not params["disable_dst_to_at"] and params["dst_change_timestamp"] > 0 and params["dst_offset"] != 0:
                # If _time is less than the last dst_change_timestamp then add it with the dst_offset
                if index_converted[data_index] < params["dst_change_timestamp"]:
                    index_converted[data_index] = index_converted[data_index] + params["dst_offset"]
            # provide a timestamp and TZ, get the policy that includes this timestamp
            policy_key = self.filter.get_policy_key(time=index_converted[data_index])
            if policy_key in D:
                # Tuple layout consumed by Policy.get_thresholds: index 0 is the alert value.
                D[policy_key].append((data['alert_value'][data_index], index_converted[data_index], False, 0, 0))
                active_policies.add(policy_key)
        # compute and accumulate the thresholds for each Policy
        T = {}
        insufficient_data_policies = []
        for policy_key in self.policies:
            the_data = D[policy_key]
            T[policy_key] = self.policies[policy_key].get_thresholds(the_data, params['kpi'])
            # Only warn about policies that actually had data points assigned.
            if T[policy_key] is None and policy_key in active_policies:
                insufficient_data_policies.append(self.policies[policy_key].title)
                self.logger.info(
                    "Insufficient data for threshold calculation: %d values." % len(D[policy_key]))
        if len(insufficient_data_policies) > 0:
            add_message(params['out_metadata'], 'WARN',
                        'insufficient data in ITSI summary index for policies %s' % str(insufficient_data_policies))
        return T

    def get_thresholds(self, data, params):
        """Computes thresholds for a KPI and this schedule.
        :param data: dict with 'alert_value': list of floats
                     '_time': list of float epoch timestamps
        :param params: dict with kpi settings
        Returns a dict of lists of threshold level structures, keyed by policy.key;
        the structures should have a populated `thresholdValue` field obtained from the result of the computation
        """
        metadata = params['out_metadata']
        thresholds = {}
        kpi_info = 'kpiid="%s" on serviceid="%s"' % (str(params['kpi']['kpi_id']), str(params['kpi']['service_id']))
        try:
            thresholds = self._get_thresholds(data=data, params=params)
        except ValueError:
            log_and_warn(metadata=metadata, logger=self.logger,
                         msg='Unconvertible alert_values found for ' + kpi_info,
                         search_msg="unconvertible values found (check this KPI's `alert_value` "
                                    "field in ITSI summary index")
        except AssertionError:
            # Method should probably raise a ValueError/try to convert 0-100 to 0.0-1.0, but for now log nicely
            # (quantile() asserts 0.0 <= q <= 1.0)
            log_and_warn(metadata=metadata, logger=self.logger,
                         msg='Invalid quantile specified for %s, must be between 0.0 and 1.0' % kpi_info,
                         search_msg='invalid quantile value, must be between 0.0 and 1.0')
        except Exception as e:
            log_and_warn(metadata=metadata, logger=self.logger, msg=str(e))
            log_and_warn(metadata=metadata, logger=self.logger,
                         msg='Unexpected exception when computing thresholds for %s' % kpi_info)
        return thresholds
def create_schedule(params, entity_config=None):
    """Build a Schedule (Policy objects + time-block filter) for one KPI.

    @param params: run parameters; reads 'out_metadata', 'logger', 'threshold_key',
                   'at_run_epoch', 'use_temp_collection' and 'kpi' (settings,
                   service_id, kpi_object, adaptive_thresholding_training_window)
    @param entity_config: optional entity dict; when given, its
                          time_variate_thresholds_specification overrides the KPI settings
    @return: a Schedule, or None when no usable (non-static, valid) policy exists
    """
    policies = {}
    metadata = params['out_metadata']
    settings = entity_config['time_variate_thresholds_specification'] if entity_config else params['kpi']['settings']
    logger = params['logger']
    # get policy settings for this KPI, create Policy objects
    for policy_key in settings['policies']:
        t_method = str(
            settings['policies'][policy_key]['policy_type'])
        t_title = str(settings['policies'][policy_key].get('title', policy_key))
        try:
            t_levels = settings['policies'][policy_key][params['threshold_key']]['thresholdLevels']
        except KeyError as e:
            # we just skip this policy
            logger.exception(e)
            log_and_warn(metadata=metadata, logger=logger, msg="Failed to retrieve %ss: %s" % (params['threshold_key'], e))
            continue
        policy_key = str(policy_key)
        if t_method == 'static':
            # Static policies never get adaptive thresholds; just note and move on.
            logger.info("Skipping static policy '%s'", policy_key)
        elif not isinstance(t_levels, list) or not t_levels:
            log_and_warn(metadata=metadata, logger=logger,
                         msg="Unable to apply adaptive thresholding on policy '%s': please specify threshold values "
                             "for the policy" % t_title)
            continue
        else:
            # Validate every level before constructing the Policy.
            skip_policy = False
            for x in t_levels:
                if 'dynamicParam' not in x:
                    log_and_warn(metadata=metadata, logger=logger,
                                 msg="Unable to apply adaptive thresholding on policy '%s': Missing threshold "
                                     "value." % t_title)
                    skip_policy = True
                    break
                try:
                    float(x['dynamicParam'])
                except (TypeError, ValueError):
                    log_and_warn(metadata=metadata, logger=logger,
                                 msg="Unable to apply adaptive thresholding on policy '%s': Invalid threshold "
                                     "value: %s" % (t_title, x['dynamicParam']))
                    skip_policy = True
                    break
            if skip_policy:
                continue
            logger.debug("Loading settings for policy %s: method=%s levels=%s" % (
                policy_key, t_method, t_levels))
            try:
                at_run_params = {'at_run_epoch': params['at_run_epoch'],
                                 'use_temp_collection': params['use_temp_collection'],
                                 'service_id': params['kpi']['service_id'],
                                 'training_window': params['kpi']['adaptive_thresholding_training_window']}
                policies[policy_key] = Policy(
                    key=policy_key, method=t_method, parameters=t_levels, title=t_title, logger=logger, at_run_params=at_run_params)
            except ValueError as e:
                logger.exception(e)
                log_and_warn(metadata=metadata, logger=logger, msg="Invalid arguments sent to Policy.")
    the_schedule = None
    if len(policies) == 0:
        return
    try:
        the_schedule = Schedule(
            kpi_object=params['kpi']['kpi_object'], policies=policies, threshold_spec=settings, params=params)
    except ValueError as e:
        logger.exception(e)
        log_and_warn(metadata=metadata, logger=logger, msg="Invalid arguments sent to Schedule.")
    return the_schedule
def output_results(at_command, params, thresholds, data, entity_config=None):
    """
    Write results to the chunked-protocol CSV writer and, for KV-store runs,
    persist computed thresholds on the KPI / entity-threshold object.

    thresholds: dict of lists of threshold levels structures, keyed by policy id
    @param at_command: truthy when invoked from the applyat command path
    @param data: dict with '_time' and 'alert_value' columns (used only when
                 there are no thresholds and at_command is falsy)
    @param entity_config: optional entity dict; switches output to entity level
    """
    settings = entity_config['time_variate_thresholds_specification'] if entity_config else params['kpi']['settings']
    service_id = params['kpi']['service_id']
    kpi_id = params['kpi']['kpi_id']
    if not thresholds and not at_command:
        # No computed thresholds: emit the raw rows tagged with their policy key.
        alerts_converted = data["alert_value"]
        time_converted = data["_time"]
        filter = PolicyFilter(settings)
        for index in range(len(time_converted)):
            try:
                alert_val = alerts_converted[index]
                time_val = time_converted[index]
                policy_key = filter.get_policy_key(time_val)
                line = {
                    'policy_key': policy_key, 'itsi_service_id': service_id, 'itsi_kpi_id': kpi_id,
                    'alert_value': alert_val, '_time': time_val
                }
                if entity_config:
                    line.update({'entity_key': entity_config['entity_key'], 'entity_title': entity_config['entity_title']})
            except IndexError:
                # Columns of mismatched length; surface the offending payload.
                raise Exception(data)
            params['writer'].writerow(line)
    else:
        for policy_id in thresholds:
            t = thresholds[policy_id]
            if t is not None:
                if params['use_kv_store']:
                    # Sanity check: computed level count must match the configured levels.
                    if len(t) != len(settings['policies'][policy_id][params['threshold_key']]['thresholdLevels']):
                        kpistr = ""
                        if service_id is not None and kpi_id is not None and service_id != "" and kpi_id != "":
                            kpistr = " for kpi %s" % str(service_id) + ":" + str(kpi_id)
                        found = len(settings['policies'][policy_id][params['threshold_key']]['thresholdLevels'])
                        msg = "Mismatched number of thresholdLevels: %s. Generated %d but found %d." % (
                            kpistr, len(t), found)
                        log_and_warn(metadata=params['out_metadata'], logger=params['logger'], msg=msg)
                    else:
                        # n.b. we assume thresholdLevels objects are
                        # sorted by increasing thresholdValue
                        # move this update_thresholds to outside
                        if entity_config:
                            params['entity_threshold_object'].update_thresholds(
                                policy=policy_id, thresholds=t, entity=entity_config)
                        else:
                            params['kpi']['kpi_object'].update_thresholds(
                                policy=policy_id, thresholds=t)
                # Always emit one summary row per policy, KV store or not.
                line = {
                    'policy_id': policy_id, 'itsi_service_id': service_id, 'itsi_kpi_id': kpi_id}
                if entity_config:
                    line.update({'entity_key': entity_config['entity_key'], 'entity_title': entity_config['entity_title']})
                for thresh_index in range(len(t)):
                    line['threshold_' + str(thresh_index)] = t[thresh_index].get('thresholdValue')
                    line['threshold_metadata_' + str(thresh_index)] = t[thresh_index]
                params['writer'].writerow(line)
    return
def ignore_invalid_row(warn_message, logger):
    """
    Method to log warning and ignore read row result
    Assumes read_chunk was invoked before this method is invoked
    @type: basestring
    @param warn_message: warning message to log
    @rtype: None
    @return: None
    """
    logger.warn(warn_message)
    # Dummy response to ignore
    # (keeps the chunked protocol in lockstep: every read must be answered)
    write_chunk(sys.stdout, {"finished": False}, '')
def gather_input_data(params, logger, fields_list):
    """Read every chunk from stdin and accumulate KPI time-series columns.

    Acknowledges each chunk over stdout, then stores the accumulated data in
    params['kpidict'] (service_id -> kpi_id -> columns) and a fresh StringIO
    in params['outbuf'] for later output.
    """
    kpidict = dict()  # kpidict['itsi_service_id']['itsi_kpi_id']
    while True:
        params['out_metadata']['finished'] = False
        chunk = read_chunk(sys.stdin, logger)
        if not chunk:
            break
        metadata, body = chunk
        parse_input_data(
            the_dict=kpidict, data=body, fields_list=fields_list, params=params)
        # Acknowledge the chunk so the search process keeps streaming.
        write_chunk(sys.stdout, params['out_metadata'], '')
        if metadata.get('finished', False):
            break
    params['kpidict'] = kpidict
    params['outbuf'] = StringIO()
def last_dst_change_timestamp(logger, dst_timezone):
    """
    Returns the last dst change timestamp (epoch seconds, rounded to 2 decimals)
    before "now" for the provided timezone, or None when it cannot be determined.

    @type logger: logger object
    @param logger: logger object
    @type dst_timezone: string
    @param dst_timezone: timezone provided by the user in itsi_settings.conf
    """
    try:
        # Creates a time zone object based on the provided timezone_name
        try:
            tz = pytz.timezone(dst_timezone)
        except UnknownTimeZoneError as e:
            # BUGFIX: previously `tz` stayed unbound here, so the code that
            # followed raised UnboundLocalError (silently swallowed by the
            # outer except). Return None explicitly instead.
            logger.exception(e)
            logger.error('Found Unknown timezone')
            return None
        # Current time in the specified zone, used to find the most recent transition.
        now = datetime.now(tz)
        # _utc_transition_times lists the (naive UTC) instants at which the
        # zone's UTC offset changes, e.g. DST starts/ends.
        # NOTE(review): this is a private pytz attribute and is absent on
        # fixed-offset zones; in that case the AttributeError falls through to
        # the outer except and we return None, matching prior behavior.
        transitions = [pytz.utc.localize(t) for t in tz._utc_transition_times]
        transitions.sort()
        # Walk the sorted transitions and keep the latest one before `now`.
        last_dst_change = None
        for transition_time in transitions:
            if transition_time < now:
                last_dst_change = transition_time
            else:
                break  # sorted ascending: nothing later can precede `now`
        if last_dst_change is None:
            logger.error('Failed to fetch last dst change timestamp')
            return None
        # Step back one second so timestamps exactly at the transition compare
        # as "before" the change; round to 2 decimals for the callers.
        return round((last_dst_change - timedelta(seconds=1)).timestamp(), 2)
    except Exception as e:
        logger.exception(e)
        logger.error('Failed to fetch last dst change timestamp')
        return None
def _disable_apply_dst_to_at(conf, logger, current_value):
    """Best-effort disable of the apply_dst_to_at stanza in itsi_settings.conf.

    Returns 1 on success; returns *current_value* unchanged when the conf
    update fails (matching the original per-branch behavior).
    """
    try:
        conf.update('apply_dst_to_at', {'disabled': 1})
        return 1
    except Exception as e:
        logger.exception(e)
        logger.error('Failed to update dst disabled settings for the threshold calculation')
        return current_value


def get_at_dst_changes_details(logger, session_key):
    """
    Fetches the dst_changes details from apply_dst_to_at stanza from itsi_settings.conf
    and returns the required data to apply dst changes.

    @type logger: logger object
    @param logger: logger object
    @type session_key: string
    @param session_key: the splunkd session key for the request
    @return: tuple (disable_dst_to_at, dst_timezone, dst_change_timestamp, dst_offset)
    """
    # Fetch data from apply_dst_to_at stanza of itsi_settings.conf
    cfm = ConfManager(session_key, 'SA-ITOA')
    conf = cfm.get_conf('itsi_settings')
    # Safe defaults: treat the feature as disabled when settings can't be read.
    # BUGFIX: these names were previously unbound when the conf read below
    # failed, which raised NameError instead of returning a clean result.
    disable_dst_to_at = 1
    dst_timezone = ''
    dst_offset = 0
    try:
        apply_dst_to_at = conf.get('apply_dst_to_at')
        disable_dst_to_at = int(apply_dst_to_at.get('disabled', 1))
        dst_timezone = apply_dst_to_at.get('timezone', '')
        dst_offset = int(apply_dst_to_at.get('offset', 0))
    except Exception as e:
        logger.exception(e)
        logger.error('Failed to fetch dst settings for the threshold calculation')
    days_since_dst = 0
    dst_change_timestamp = 0
    if not disable_dst_to_at:
        if dst_timezone and dst_offset != 0:
            # Fetch the last dst change timestamp as per the provided timezone
            dst_change_timestamp = last_dst_change_timestamp(logger, dst_timezone)
            if dst_change_timestamp:
                # Calculate the number of days past dst based on timestamp
                last_dst_change_datetime = datetime.utcfromtimestamp(dst_change_timestamp)
                current_datetime = datetime.utcnow()
                days_since_dst = (current_datetime - last_dst_change_datetime).days
                # Disable apply_dst_to_at in itsi_settings.conf if more than 60 days are past since dst changes
                if days_since_dst > 60:
                    disable_dst_to_at = _disable_apply_dst_to_at(conf, logger, disable_dst_to_at)
            else:
                disable_dst_to_at = _disable_apply_dst_to_at(conf, logger, disable_dst_to_at)
                logger.error('Could not find a timestamp for the provided timezone. Hence DST changes to AT will not be applied')
        else:
            disable_dst_to_at = _disable_apply_dst_to_at(conf, logger, disable_dst_to_at)
            logger.error('Could not find a timezone or an offset to apply DST changes to AT. Hence DST changes to AT will not be applied')
    return disable_dst_to_at, dst_timezone, dst_change_timestamp, dst_offset
def chunker(params, at_command=False):
    """
    Iterate every targeted (service_id, kpi_id) pair and stream results out.

    Sets up a CSV writer over an in-memory buffer, bulk-loads the service,
    entity-threshold and custom-threshold-window context needed by the
    targeted KPIs, then dispatches each KPI (or each entity of each KPI when
    entity-level thresholding is on) to calculate_thresholds(). The whole
    buffer is finally flushed to stdout as one chunk.

    @type params: dict
    @param params: shared command state; expects at least 'logger', 'kpidict',
        'out_metadata', 'session_key' and 'entity_level_thresholds'. Several
        keys ('outbuf', 'writer', 'service_object', 'kpi', dst settings, ...)
        are added here and consumed by downstream helpers.
    @type at_command: bool
    @param at_command: True when running the adaptive-thresholding (AT)
        command (emits threshold columns), False for the apply command
        (emits per-datapoint alert values).
    """
    logger = params['logger']
    kpidict = params['kpidict']
    params['outbuf'] = StringIO()
    # Output schema differs per mode: AT emits up to 10 threshold slots per
    # policy; apply mode emits one alert value per timestamp.
    if at_command:
        fields_list = ['policy_id']
        for k in range(10):
            fields_list.append("threshold_" + str(k))
            fields_list.append("threshold_metadata_" + str(k))
        fields_list = fields_list + ['itsi_service_id', 'itsi_kpi_id']
        if params['entity_level_thresholds']:
            fields_list = fields_list + ['entity_key', 'entity_title']
    else:
        fields_list = ['policy_key', 'itsi_service_id', 'itsi_kpi_id', 'alert_value', '_time']
    # prepare for generating output
    params['out_metadata']['finished'] = False
    # Create a dict writer with IO; extra keys in a row dict are dropped
    # rather than raising (extrasaction='ignore').
    params['writer'] = csv.DictWriter(
        params['outbuf'], fieldnames=fields_list,
        dialect='excel', extrasaction='ignore')
    params['writer'].writeheader()
    # Get the service object
    params['service_object'] = get_service_object(params)
    # DST handling settings come from itsi_settings.conf (may disable itself
    # once 60 days have passed since the last DST change).
    params['disable_dst_to_at'], params['dst_timezone'], params['dst_change_timestamp'], params['dst_offset'] = get_at_dst_changes_details(logger, session_key=params["session_key"])
    # Bulk fetch the services of targeted kpis
    if params['service_object']:
        params['service_object'].bulk_fetch_service(kpidict.keys())
    if params['entity_level_thresholds']:
        params['entity_threshold_object'] = EntityThreshold(logger=params['logger'])
        params['entity_threshold_object'].initialize_interface(
            params['session_key'], owner='nobody')
    if at_command:
        # Flatten all KPI ids so active custom threshold windows (CTWs) can
        # be fetched in one call.
        list_kpis = []
        for itsi_service_id in kpidict:
            for itsi_kpi_id in kpidict[itsi_service_id]:
                list_kpis.append(itsi_kpi_id)
        # Get the Active Custom Threshold Windows which are of type percentage
        ctw_object = CustomThresholdWindow(logger=logger)
        ctw_object.initialize_interface(
            params['session_key'], owner='nobody')
        ctw_linked_kpis = ctw_object.bulk_fetch_active_ctw(list_kpis)
    # Phase 2: iterate over (serviceid, kpiid) and output scores
    for itsi_service_id in kpidict:
        params['kpi'] = {
            'service_id': itsi_service_id,
            'service_data': None
        }
        if params['service_object']:
            # save the service data
            params['kpi']['service_data'] = params['service_object'].fetch_service(itsi_service_id)
        for itsi_kpi_id in kpidict[itsi_service_id]:
            params['kpi']['kpi_id'] = itsi_kpi_id
            # get the KPI object
            params['kpi']['kpi_object'] = get_kpi_object(params)
            if params['kpi']['kpi_object'] is None:
                ignore_invalid_row('No KPI found with id %s, ignoring ...' % itsi_kpi_id, logger)
                continue
            # get the settings
            kpi_tmp = params['kpi']['kpi_object'].get_kpi()
            if not isinstance(kpi_tmp, dict):
                ignore_invalid_row('No valid KPI found with id %s, ignoring ...' % itsi_kpi_id, logger)
                continue
            if 'time_variate_thresholds_specification' not in kpi_tmp:
                ignore_invalid_row(
                    'No valid thresholds specification found for KPI with id %s, ignoring ...' % itsi_kpi_id,
                    logger
                )
                continue
            params['kpi']['entity_thresholds'] = {}
            if params['entity_level_thresholds']:
                list_entity_keys = kpidict[itsi_service_id][itsi_kpi_id].keys()
                entity_threshold_configs = params['entity_threshold_object'].bulk_fetch_configs(itsi_kpi_id, list_entity_keys, params['pseudo_entities'])
                if not entity_threshold_configs:
                    # NOTE(review): 'break' abandons the remaining KPIs of
                    # this service when one KPI has no entity threshold
                    # configs — looks like it should be 'continue'; confirm.
                    break
                for entity_config in entity_threshold_configs:
                    # Create temp Entity Key to store persistent entity config in a global object
                    entity_key = entity_config.get("entity_key") if entity_config.get("entity_key") != 'N/A' else hashlib.md5((entity_config['entity_title'] + entity_config['kpi_id']).encode("utf-8")).hexdigest()
                    params['kpi']['entity_thresholds'].update({entity_key: entity_config})
            params['kpi']['settings'] = kpi_tmp[
                'time_variate_thresholds_specification']
            params['kpi']['detect_outliers'] = False
            params['kpi']['outlier_detection_algo'] = None
            params['kpi']['outlier_multiplier'] = None
            params['kpi']['adaptive_thresholding_training_window'] = kpi_tmp['adaptive_thresholding_training_window']
            if 'aggregate_outlier_detection_enabled' in kpi_tmp:
                # Outlier removal applies only to aggregate (service-level)
                # series, never when thresholding per entity.
                params['kpi']['detect_outliers'] = False if params['entity_level_thresholds'] else kpi_tmp['aggregate_outlier_detection_enabled']
                if kpi_tmp['aggregate_outlier_detection_enabled']:
                    if 'outlier_detection_algo' in kpi_tmp:
                        params['kpi']['outlier_detection_algo'] = kpi_tmp['outlier_detection_algo']
                    if 'outlier_detection_sensitivity' in kpi_tmp:
                        params['kpi']['outlier_multiplier'] = kpi_tmp['outlier_detection_sensitivity']
            if at_command:
                # KPIs under an active CTW with AT enabled must have their
                # custom-threshold windows recomputed downstream.
                if kpi_tmp['_key'] in ctw_linked_kpis and kpi_tmp['adaptive_thresholds_is_enabled']:
                    kpi_tmp['recalculate_custom_thresholds'] = True
            if params['kpi']['settings'] is not None:
                if params['entity_level_thresholds']:
                    for entity_key in kpidict[itsi_service_id][itsi_kpi_id]:
                        # Ignore Entity from data if we don't have configuration available as kpi_entity_threshold
                        if not params['kpi']['entity_thresholds'].get(entity_key):
                            continue
                        calculate_thresholds(at_command, params=params, data=kpidict[itsi_service_id][itsi_kpi_id][entity_key], entity_config=params['kpi']['entity_thresholds'][entity_key])
                else:
                    calculate_thresholds(at_command, params=params, data=kpidict[itsi_service_id][itsi_kpi_id], entity_config=None)
            else:
                ignore_invalid_row(
                    'No valid thresholds specification found for KPI with id %s, ignoring ...' % itsi_kpi_id,
                    logger
                )
                continue
    # Write output datain buffer
    write_chunk(sys.stdout, params['out_metadata'], params['outbuf'].getvalue())
def calculate_thresholds(at_command, params, data, entity_config=None):
    """
    Calculate Thresholds from data dict

    Cleans the raw data points, then either computes adaptive thresholds
    from the KPI's policy schedule (AT mode) or prepares the cleaned values
    for per-datapoint alert-value output (apply mode), and finally writes
    the rows to the output buffer via output_results().

    @type at_command: bool
    @param at_command: True for the AT command, False for the apply command
    @type: dict
    @param data: Data points dictionary to parse to get new threshold values
    @type: dict
    @param entity_config: Entity Level Configuration object. Only for Entity level thresholding
    """
    if at_command:
        # create the schedule
        the_schedule = create_schedule(params=params, entity_config=entity_config)
    values = clean_values(
        data=data,
        params=params
    )
    thresholds = None
    if at_command:
        # compute the thresholds
        if the_schedule is not None:
            thresholds = the_schedule.get_thresholds(
                data=values, params=params)
    else:
        # apply mode: no schedule to evaluate; 'policy_key' collects the
        # policy assigned to each data point downstream (it is the first
        # column of the apply-mode CSV output).
        thresholds = {}
        values['policy_key'] = []
    # write output to buffer
    output_results(at_command=at_command, params=params, thresholds=thresholds, data=values, entity_config=entity_config)
def remove_outliers(data, method, multiplier):
    """
    Strip outlier data points from ``data`` in place and return them.

    Falls back to the standard-deviation method with a multiplier of 2 when
    either setting is missing; the actual detection and removal is delegated
    to apply_outlier_algorithm() with remove=True.
    """
    effective_method = method if method is not None else "stdev"
    effective_multiplier = multiplier if multiplier is not None else 2
    return apply_outlier_algorithm(data, effective_method, effective_multiplier, remove=True)
def apply_outlier_algorithm(data, method, multiplier, remove=False):
    """
    Flag (and optionally drop) outliers in a KPI time series.

    Bounds are ``center +/- multiplier * spread`` where center/spread depend
    on the method: median/MAD ('mad'), median/inter-quartile range ('iqr'),
    or mean/standard deviation ('stdev').

    @type data: dict
    @param data: must contain 'alert_values', a list of tuples of at least
        five elements: (value, _time, is_outlier, lower_bound, upper_bound).
        Mutated in place: every tuple's bound slots are filled in, outliers
        get is_outlier=True, and are dropped from 'alert_values' when
        ``remove`` is True.
    @type method: str
    @param method: 'mad', 'iqr' or 'stdev' (case-insensitive)
    @param multiplier: sensitivity factor applied to the spread
    @type remove: bool
    @param remove: when True, outliers are removed from data['alert_values']
        instead of being kept (flagged) in place
    @return: (outliers, lower_bound, upper_bound) where outliers is the list
        of flagged tuples
    @raise Exception: on malformed/empty input or an unsupported method
    """
    if data is None or 'alert_values' not in data:
        raise Exception("Data is empty or not in correct format for applying outlier algorithm")
    list_of_tuples = data['alert_values']
    arr_floats = [float(i[0]) for i in list_of_tuples]
    median = statistics.median(arr_floats)
    # Hoist loop/branch-invariant conversions out of the per-point work.
    multiplier = float(multiplier)
    method_key = method.lower()
    # Calculate bounds
    if method_key == 'mad':
        mad = statistics.median([abs(val - median) for val in arr_floats])
        upper_bound = median + (multiplier * mad)
        lower_bound = median - (multiplier * mad)
    elif method_key == 'iqr':
        # quantile() is a module-level helper; it expects sorted input.
        arr_floats = sorted(arr_floats)
        iqr = quantile(arr_floats, 0.75) - quantile(arr_floats, 0.25)
        upper_bound = median + (multiplier * iqr)
        lower_bound = median - (multiplier * iqr)
    elif method_key == 'stdev':
        mean = statistics.mean(arr_floats)
        stdev = statistics.stdev(arr_floats)
        upper_bound = mean + (multiplier * stdev)
        lower_bound = mean - (multiplier * stdev)
    else:
        raise Exception("Unsupported outlier detection method: %s" % method)
    updated_values = []
    outliers = []
    # Iterate over data and mark outliers
    for x in list_of_tuples:
        x_list = list(x)
        try:
            x_list[3] = lower_bound
            x_list[4] = upper_bound
        except IndexError:
            # Previously re-raised with just the bare list as the message,
            # which was useless in the logs; say what is actually wrong.
            raise Exception(
                "Data point has too few fields for outlier bounds "
                "(expected at least 5, got %d): %s" % (len(x_list), x_list))
        value = float(x[0])
        if value > upper_bound or value < lower_bound:
            x_list[2] = True
            outliers.append(tuple(x_list))
            if remove:
                continue
        updated_values.append(tuple(x_list))
    data['alert_values'] = updated_values
    return outliers, lower_bound, upper_bound
def detect_outliers(params, policy_chunks):
    """
    Updates policy_chunks dictionary with outliers detected based on the method chosen
    also returns outliers in a separate dictionary

    @type params: dict
    @param params: must contain 'logger', 'method' and 'multiplier'
    @type policy_chunks: dict
    @param policy_chunks: policy key -> list of data-point tuples; each value
        is replaced in place with the bound-annotated tuples produced by
        apply_outlier_algorithm()
    @return: OrderedDict mapping each policy key to its list of outliers,
        in the same order as policy_chunks
    """
    policy_outlier_map = collections.OrderedDict()
    logger = params['logger']
    # Hoisted out of the loop: same method/multiplier for every policy block.
    method = params['method'].lower()
    multiplier = params['multiplier']
    # Identify outliers per policy block
    for policy_key, points in policy_chunks.items():
        data = {'alert_values': points}
        outliers, _, _ = apply_outlier_algorithm(data, method, multiplier)
        # Lazy %-args: the message is only rendered when DEBUG is enabled.
        logger.debug(
            "%s outliers identified for method: %s, multiplier: %s, outliers: %s",
            len(outliers), params['method'], params['multiplier'], outliers)
        policy_outlier_map[policy_key] = outliers
        policy_chunks[policy_key] = data['alert_values']
    return policy_outlier_map