# Copyright (C) 2005-2025 Splunk Inc. All Rights Reserved.
import csv
import copy
import collections
from .custom_threshold_window import CustomThresholdWindow
from .chunked_util import die, add_message, read_chunk, write_chunk
from datetime import datetime, timedelta
from .kpi import KPIBase, ServiceKPI, TempKPI, FileBackedKPI, Service, EntityThreshold
import logging
import math
import pytz
from pytz.exceptions import UnknownTimeZoneError
import statistics
import sys
from io import StringIO
import hashlib
from ITOA.itoa_common import is_feature_enabled
from ITOA.setup_logging import setup_logging
from itsi.objects.itsi_at_incremental_values import ItsiAtIncrementalValues
from itsi.itsi_time_block_utils import PolicyFilter
from tdigest import TDigest
from SA_ITOA_app_common.solnlib.conf_manager import ConfManager
##################
# itsiatutils
##################
# Utility module for AT and outlier detection custom search commands.
# Windows will mangle our line-endings unless we do this.
if sys.platform == "win32":
import os
import msvcrt
msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
msvcrt.setmode(sys.stderr.fileno(), os.O_BINARY)
msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY)
outlier_logger = setup_logging("itsi_apply_at_outliers.log", "itsi.apply_at.outliers", level=logging.DEBUG)
MIN_DATASET_LEN = 20
"""
Factors used to scale down the number of KPIs processed per batch in the itsibatch CSC.
Limits the amount of KPI time series data that must be passed to the applyat CSC for
each training window.
"""
AT_SCALE_DOWN_FACTORS = {
'-7d': 1,
'-14d': 2,
'-30d': 4,
'-60d': 8,
}
"""
Map of the string representation of the adaptive thresholding training window to
the integer number of days in the training window
"""
AT_WINDOW_TO_DAYS_MAP = {
'-7d': 7,
'-14d': 14,
'-30d': 30,
'-60d': 60,
}
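# Illustrative use of the two maps above (a sketch with a hypothetical batch size):
# a batch of 64 KPIs sized for a -7d window would be scaled down for a -30d window as
#   scaled = 64 // AT_SCALE_DOWN_FACTORS['-30d']  # -> 16 KPIs per batch
# while AT_WINDOW_TO_DAYS_MAP['-30d'] -> 30 gives the number of daily buckets
# kept for that training window.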
def log_and_warn(metadata, logger, msg, search_msg=None):
search_msg = search_msg or msg
logger.warn(msg)
add_message(metadata, 'WARN', search_msg)
def log_and_die(metadata, logger, msg, search_msg=None):
logger.error(msg)
die(metadata, msg, search_msg)
def generate_at_search(kpi_ids, use_incremental_method=False, log_level='INFO'):
"""
Creates the search needed to run KPI level adaptive thresholding
@type: list
@param kpi_ids: the list of kpi_ids
@type: bool
@param use_incremental_method: flag indicating if the incremental method should be applied
@type: string
@param log_level: log_level for applyat command
"""
if not isinstance(kpi_ids, list) or len(kpi_ids) < 1:
return ''
incremental_method_flag = 'useincrementalmethod' if use_incremental_method else ''
itsi_kpi_ids = 'itsi_kpi_id IN (' + ', '.join(kpi_ids) + ')'
return '| mstats latest(alert_value) AS alert_value latest(alert_level) AS alert_level WHERE ' \
'`get_itsi_summary_metrics_index` AND ( ' + itsi_kpi_ids + ' ) AND is_filled_gap_event!=1 ' \
'AND is_null_alert_value=0 `metrics_service_level_kpi_only` by itsi_kpi_id, ' \
'itsi_service_id span=1m | where alert_level!=-2 and not isnull(alert_value) | table _time, alert_value, alert_level, ' \
'itsi_kpi_id, itsi_service_id | applyat ' + incremental_method_flag + ' log_level=' + log_level
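# Illustrative sketch of the SPL produced above, assuming two hypothetical kpi ids
# passed as already-quoted strings (the macros expand at Splunk search time):
#   generate_at_search(['"kpi_a"', '"kpi_b"'], use_incremental_method=True, log_level='DEBUG')
# yields a search of the form:
#   | mstats ... WHERE `get_itsi_summary_metrics_index` AND ( itsi_kpi_id IN ("kpi_a", "kpi_b") ) ...
#   | applyat useincrementalmethod log_level=DEBUG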
def generate_entity_at_search(entity_objects, use_incremental_method=False, log_level='INFO'):
"""
Creates the search needed to run entity level adaptive thresholding
@type: list
@param entity_objects: the list of entity objects having entity_key, entity_title and kpi_id
@type: bool
@param use_incremental_method: flag indicating if the incremental method should be applied
@type: string
@param log_level: log_level for applyat command
@rtype: string
@return: SPL search
"""
rv = ""
if not isinstance(entity_objects, list) or len(entity_objects) < 1:
return rv
incremental_method_flag = 'useincrementalmethod' if use_incremental_method else ''
kpi_filter_string = " OR ".join("(itsi_kpi_id=\"" + x['kpi_id'] + "\" AND entity_key=\"" + x['entity_key']
+ "\" AND entity_title=\"" + x['entity_title'] + "\")" for x in entity_objects)
base_string = """
| mstats latest(alert_value) AS alert_value latest(alert_level) AS alert_level WHERE
`get_itsi_summary_metrics_index` AND ( {kpi_filter} ) AND is_filled_gap_event!=1 AND is_null_alert_value=0
AND `metrics_entity_level_kpi_only` by itsi_kpi_id, itsi_service_id, entity_key, entity_title span=1m
| where alert_level!=-2 and not isnull(alert_value)
| table _time, alert_value, alert_level, itsi_kpi_id, itsi_service_id, entity_key, entity_title
"""
rv += base_string.format(kpi_filter=kpi_filter_string)
rv += " | applyat entitylevelthreshold " + incremental_method_flag + " log_level=" + log_level
return rv
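# Illustrative sketch of the per-entity filter clause built above (hypothetical ids):
#   entity_objects = [{'kpi_id': 'k1', 'entity_key': 'e1', 'entity_title': 'host-a'}]
# produces:
#   (itsi_kpi_id="k1" AND entity_key="e1" AND entity_title="host-a")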
def generate_ml_entity_at_search(entity_objects, kpi_object):
"""
Creates the search needed to run entity-level ML-assisted adaptive thresholding
@type: list of dict
@param entity_objects: the list of entity objects having entity_key, entity_title and kpi_id
@type: dict
@param kpi_object: KPI object with entity AT parameters
@rtype: string
@return: SPL search
"""
rv = ""
if not isinstance(entity_objects, list) or len(entity_objects) < 1:
return rv
kpi_filter_string = " OR ".join("(itsi_kpi_id=\"" + x['kpi_id'] + "\" AND entity_key=\"" + x['entity_key']
+ "\" AND entity_title=\"" + x['entity_title'] + "\")" for x in entity_objects)
base_string = """
| mstats latest(alert_value) AS alert_value latest(alert_level) AS alert_level WHERE
`get_itsi_summary_metrics_index` AND ( {kpi_filter} ) AND is_filled_gap_event!=1 AND is_null_alert_value=0
AND `metrics_entity_level_kpi_only` by itsi_kpi_id, itsi_service_id, entity_key, entity_title span=1m
| where alert_level!=-2 and not isnull(alert_value)
| table _time, alert_value, alert_level, itsi_kpi_id, itsi_service_id, entity_key, entity_title
"""
rv += base_string.format(kpi_filter=kpi_filter_string)
command_string = """
| recommendthresholdtemplate threshold_direction=auto entity_level_processing=true
send_to_api=true threshold_direction={threshold_direction} use_static={use_static}
analysis_window={analysis_window}
"""
rv += command_string.format(
threshold_direction=kpi_object["entity_threshold_direction"],
use_static=kpi_object["entity_keep_recommended_policy_static"],
analysis_window=kpi_object["entity_recommendation_training_window"],
)
if kpi_object.get("entity_recommendation_allow_negative_value") is not None:
rv += "non_negative={0} ".format(not kpi_object["entity_recommendation_allow_negative_value"])
if kpi_object.get("entity_recommendation_threshold_sensitivity") is not None:
rv += "sensitivity_level={0} ".format(kpi_object["entity_recommendation_threshold_sensitivity"])
return rv
def generate_ml_entity_at_scout_search(entity_objects):
"""
Creates the helper search for identifying entities with data for this KPI
@type: list of dict
@param entity_objects: the list of entity objects having entity_key, entity_title and kpi_id
@rtype: string
@return: SPL search
"""
rv = ""
if not isinstance(entity_objects, list) or len(entity_objects) < 1:
return rv
kpi_filter_string = " OR ".join("(itsi_kpi_id=\"" + x['kpi_id'] + "\" AND entity_key=\"" + x['entity_key']
+ "\" AND entity_title=\"" + x['entity_title'] + "\")" for x in entity_objects)
base_string = """
| mstats latest(alert_value) AS alert_value latest(alert_level) AS alert_level WHERE
`get_itsi_summary_metrics_index` AND ( {kpi_filter} ) AND is_filled_gap_event!=1 AND is_null_alert_value=0
AND `metrics_entity_level_kpi_only` by itsi_kpi_id, itsi_service_id, entity_key, entity_title span=1d
| where alert_level!=-2 and not isnull(alert_value)
| dedup entity_key, entity_title
| table _time, alert_value, alert_level, itsi_kpi_id, itsi_service_id, entity_key, entity_title
"""
return base_string.format(kpi_filter=kpi_filter_string)
def divide_into_batches(ids, batch_size=1, incremental_method=False):
"""
Divides the ids in groups by batch size
@type: list
@param ids: the list of ids
@type: int
@param batch_size: batch size to divide ids into
@type: bool
@param incremental_method: flag indicating if incremental method should be applied
"""
for i in range(0, len(ids), batch_size):
yield [ids[i: i + batch_size], incremental_method]
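# Illustrative sketch: batching five hypothetical ids two at a time yields
#   list(divide_into_batches(['a', 'b', 'c', 'd', 'e'], batch_size=2))
#   -> [[['a', 'b'], False], [['c', 'd'], False], [['e'], False]]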
def generate_searches(batches):
"""
Generate the SPL needed to run adaptive thresholding searches
@type: list
@param batches: the groups of ids needed to generate the searches
"""
for batch in batches:
yield generate_at_search(batch)
def quantile(data, q):
"""Naive implementation of linear-interpolated quantile.
Comparable to numpy.percentile()/pd.DataFrame.quantile().
Author: Jacob Leverich (jleverich@splunk.com)
"""
assert q >= 0. and q <= 1.
m = float(len(data) - 1)
i = m * q
ilow = math.floor(i)
ihigh = math.ceil(i)
if ilow == ihigh:
return data[int(ilow)]
f = (i - ilow) / (ihigh - ilow)
low = data[int(ilow)]
high = data[int(ihigh)]
return low + f * (high - low)
def quantiles(data, levels):
# Remove nan's if any in the data
data = [x for x in data if not math.isnan(x)]
data = sorted(data)
out = {x: quantile(data, float(x)) for x in levels}
return out
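# Worked example of the linear interpolation above (a sketch, not a test):
#   quantile([1.0, 2.0, 3.0, 4.0], 0.5)
#   m = 3, i = 1.5, ilow = 1, ihigh = 2, f = 0.5 -> 2.0 + 0.5 * (3.0 - 2.0) = 2.5
#   quantiles([3.0, 1.0, float('nan'), 2.0, 4.0], [0.25, 0.75]) drops the nan,
#   sorts to [1.0, 2.0, 3.0, 4.0] and returns {0.25: 1.75, 0.75: 3.25}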
def parse_input_data(the_dict, data, fields_list, params):
"""
Populates the_dict with the values in data keyed by the fields in fields_list.
@param the_dict: dict keyed by service_id and then by kpi_id into which we will write the data
@param data: the incoming event data
@param fields_list: list of strings containing the field names to be added as data to the appropriate list in the_dict
@param params: Contains keys 'logger', 'use_kv_store', 'out_metadata', and 'kpi', the last of which contains 'service_id' and 'kpi_id'
"""
use_kv_store = params['use_kv_store']
logger = params['logger']
reader = csv.DictReader(data.splitlines(), dialect='excel')
for record in reader:
if 'itsi_service_id' not in record:
if not use_kv_store:
log_and_warn(metadata=params[
'out_metadata'], logger=logger, msg="Missing Service ID: %s. Generating dummy value." % repr(record))
record['itsi_service_id'] = 'DEFAULT_SERVICE_ID'
if 'itsi_kpi_id' not in record:
if not use_kv_store:
log_and_warn(metadata=params[
'out_metadata'], logger=logger, msg="Missing KPI ID: %s. Generating dummy value." % repr(record))
record['itsi_kpi_id'] = 'DEFAULT_KPI_ID'
if params['entity_level_thresholds'] and 'entity_title' not in record:
if not use_kv_store:
log_and_warn(metadata=params[
'out_metadata'], logger=logger, msg="Missing Entity Title: %s. Generating dummy value." % repr(record))
record['entity_title'] = 'DEFAULT_ENTITY_TITLE'
for f in fields_list:
if record[f] == '' and f not in ('itsi_service_id', 'itsi_kpi_id', 'entity_title', 'entity_key'):
log_and_die(
metadata=params['out_metadata'], logger=logger, msg="Missing field %s at time %s" % (str(f), str(record['_time'])))
itsi_service_id = record['itsi_service_id']
itsi_kpi_id = record['itsi_kpi_id']
if itsi_service_id not in the_dict:
the_dict[itsi_service_id] = dict()
if itsi_kpi_id not in the_dict[itsi_service_id]:
the_dict[itsi_service_id][itsi_kpi_id] = dict()
if not params['entity_level_thresholds']:
tmpdict = {}
for f in fields_list:
tmpdict[f] = list()
the_dict[record['itsi_service_id']][record['itsi_kpi_id']] = tmpdict
if params['entity_level_thresholds']:
itsi_entity_key = record.get('entity_key', "N/A")
if itsi_entity_key == "N/A":
itsi_entity_key = hashlib.md5((record['entity_title'] + itsi_kpi_id).encode("utf-8")).hexdigest()
params['pseudo_entities'].update({itsi_entity_key: record['entity_title']})
if itsi_entity_key not in the_dict[itsi_service_id][itsi_kpi_id]:
tmpdict = {}
for f in fields_list:
tmpdict[f] = list()
the_dict[record['itsi_service_id']][record['itsi_kpi_id']][itsi_entity_key] = tmpdict
currentdict = the_dict[itsi_service_id][itsi_kpi_id][itsi_entity_key]
else:
currentdict = the_dict[itsi_service_id][itsi_kpi_id]
for f in fields_list:
currentdict[f].append(record[f])
def drop_dup(data, index):
"""Naive re-implementation of pd.DataFrame.drop_duplicates()"""
out_data = {k: [] for k in list(data.keys())}
last = None
for i, v in enumerate(data[index]):
if v != last:
for k in list(data.keys()):
out_data[k].append(data[k][i])
last = v
return out_data
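# Illustrative sketch: only adjacent duplicate index values are collapsed,
# keeping the first row of each run:
#   drop_dup({'_time': [1.0, 1.0, 2.0], 'alert_value': [5.0, 6.0, 7.0]}, '_time')
#   -> {'_time': [1.0, 2.0], 'alert_value': [5.0, 7.0]}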
def clean_values(data, params):
"""Non-pandas replacement for atad_utils.create_dataframe().
@param data: dict of '_time': list(epoch timestamp strings)
'alert_value': list(float strings)
'alert_period': list(float strings), optional
@param params: dict with keys 'logger' and 'out_metadata'
"""
logger = params['logger']
metadata = params['out_metadata']
values = dict(data)
for i in range(len(values['_time'])):
try:
values['_time'][i] = float(values['_time'][i])
except ValueError:
log_and_warn(metadata, logger, "Can't parse _time '%s' as float" % values['_time'][i])
values['_time'][i] = float('nan')
# Drop duplicates
values = drop_dup(values, '_time')
for i in range(len(values['alert_value'])):
try:
values['alert_value'][i] = float(values['alert_value'][i])
except ValueError:
log_and_warn(metadata, logger, "Can't parse alert_value '%s' as float" % values['alert_value'][i])
values['alert_value'][i] = float('nan')
if 'alert_period' in values:
for i in range(len(values['alert_period'])):
try:
values['alert_period'][i] = float(values['alert_period'][i])
except ValueError:
log_and_warn(metadata, logger, "Can't parse alert_period '%s' as float" % values['alert_period'][i])
values['alert_period'][i] = float('nan')
return values
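# Illustrative sketch with hypothetical input (some_logger is a stand-in logger):
# unparseable strings become nan and adjacent duplicate timestamps are dropped:
#   clean_values({'_time': ['100', '100', 'bad'], 'alert_value': ['1.5', '2.5', '3.5']},
#                {'logger': some_logger, 'out_metadata': {}})
#   -> {'_time': [100.0, nan], 'alert_value': [1.5, 3.5]}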
def get_service_object(params):
service_object = None
if params['use_kv_store'] and not params['use_temp_collection']:
service_object = Service(logger=params['logger'])
service_object.initialize_interface(
params['session_key'], owner='nobody')
return service_object
def get_kpi_object(params):
kpi_object = None
if params['use_kv_store']:
if params['use_temp_collection'] and params['temp_collection'] is not None and params['temp_key'] is not None:
kpi_object = TempKPI(logger=params['logger'], temp_collection_name=params['temp_collection'], temp_object_key=params['temp_key'])
else:
kpi_object = ServiceKPI(
logger=params['logger'], service_data=params['kpi']['service_data'], kpi_id=params['kpi']['kpi_id'])
kpi_object.initialize_interface(
params['session_key'], owner='nobody', namespace='SA-ITOA')
kpi_object.fetch_kpi()
params['logger'].debug(
"Initialized KV interface with session key %s" % params['session_key'])
elif params['settings_file'] is not None:
kpi_object = FileBackedKPI(
logger=params['logger'], filename=params['settings_file'])
return kpi_object
# Policy Class
class Policy(object):
def __init__(self, key, method, parameters, at_run_params, **kwargs):
# validate methods and parameters
if not isinstance(key, str):
raise ValueError(
"Null or non-string key sent to Policy constructor.")
if not isinstance(method, str):
raise ValueError(
"Null or non-string method sent to Policy constructor. Must be a string: stdev, quantile, range, or percentage.")
method_str = str(method)
if method_str not in ['stdev', 'quantile', 'range', 'percentage']:
raise ValueError(
"Method must be one of stdev, quantile, range, or percentage.")
if not parameters: # parameters is a list of threshold levels
raise ValueError("Null parameters sent to Policy constructor.")
if not isinstance(parameters, list) or len(parameters) > 10:
raise ValueError(
"Parameters must be a list of no more than 10 levels.", parameters)
if not all('dynamicParam' in x for x in parameters):
raise ValueError("Every level record must have a dynamicParam attribute")
# store policies in form amenable to computing thresholds
self.key = key
self.method = method_str
self.parameters = parameters
self.title = kwargs.get('title', key)
self.logger = kwargs.get('logger')
self.at_run_params = at_run_params
@property
def parameter_values(self):
# property that extracts dynamic param values from parameter list
return [float(x['dynamicParam']) for x in self.parameters]
def get_updated_levels(self, computed_thresholds, kpi_id, service_id):
"""
Returns a copy of the levels structure stored in self.parameters
where thresholdValue field is updated from the computed levels array
"""
if len(computed_thresholds) != len(self.parameters):
raise ValueError("Computed thresholds and stored thresholds structures are not of the same length")
result = []
for computed_value, level in zip(computed_thresholds, self.parameters):
level_copy = copy.copy(level)
level_copy['thresholdValue'] = computed_value
result.append(level_copy)
self.logger.debug("Calculated thresholdLevels for policy %s of kpi %s and service %s are %s", self.key, kpi_id, service_id, result)
return result
def update_outlier_incremental_value(self, value_to_update, alert_value, outlier_method):
"""
Updates the incremental value appropriately based on the alert
value and policy type with data for use in outlier exclusion calculation
@type: dict
@param value_to_update: incremental values object to apply updates on
@type: float
@param alert_value: KPI alert value to update incremental values with
@type: string
@param outlier_method: outlier exclusion algorithm method of the KPI
"""
if isinstance(alert_value, (int, float)):
if outlier_method == 'stdev':
if 'unfiltered_sum' in value_to_update:
value_to_update['unfiltered_sum'] += alert_value
value_to_update['unfiltered_count'] += 1
value_to_update['unfiltered_sum_of_squares'] += alert_value ** 2
else:
value_to_update['unfiltered_sum'] = alert_value
value_to_update['unfiltered_count'] = 1
value_to_update['unfiltered_sum_of_squares'] = alert_value ** 2
elif outlier_method in ('mad', 'iqr'):
if 'unfiltered_digest' not in value_to_update:
value_to_update['unfiltered_digest'] = TDigest()
value_to_update['unfiltered_digest'].update(alert_value)
else:
raise Exception("Unsupported outlier detection method: %s" % outlier_method)
def update_incremental_value(self, value_to_update, alert_value):
"""
Updates the incremental value appropriately based on the alert
value and policy type
@type: dict
@param value_to_update: incremental values object to apply updates on
@type: float
@param alert_value: KPI alert value to update incremental values with
"""
if isinstance(alert_value, (int, float)):
if self.method == 'stdev':
if 'sum' in value_to_update:
value_to_update['sum'] += alert_value
value_to_update['sum_of_squares'] += alert_value ** 2
value_to_update['count'] += 1
else:
value_to_update['sum'] = alert_value
value_to_update['sum_of_squares'] = alert_value ** 2
value_to_update['count'] = 1
elif self.method == 'range':
if 'min' in value_to_update:
value_to_update['min'] = min(value_to_update['min'], alert_value)
value_to_update['max'] = max(value_to_update['max'], alert_value)
value_to_update['count'] += 1
else:
value_to_update['min'] = alert_value
value_to_update['max'] = alert_value
value_to_update['count'] = 1
elif self.method == 'quantile':
if 'digest' not in value_to_update:
value_to_update['digest'] = TDigest()
value_to_update['digest'].update(alert_value)
elif self.method == 'percentage':
if 'sum' in value_to_update:
value_to_update['sum'] += alert_value
value_to_update['count'] += 1
else:
value_to_update['sum'] = alert_value
value_to_update['count'] = 1
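# Illustrative sketch of how a stdev day-bucket accumulates, assuming `policy`
# is a hypothetical Policy with method 'stdev':
#   bucket = {'timestamp': 1700000000.0}
#   policy.update_incremental_value(bucket, 3.0)
#   policy.update_incremental_value(bucket, 5.0)
#   bucket -> {'timestamp': 1700000000.0, 'sum': 8.0, 'sum_of_squares': 34.0, 'count': 2}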
def apply_outlier_algorithm_incremental(self, method, multiplier, incremental_values, kpi_values):
"""
Calculates and marks outliers in the KPI data based on the outlier exclusion algorithm method
and multiplier
@type: string
@param method: outlier exclusion algorithm method of the KPI
@type: float
@param multiplier: the sensitivity multiplier of the outlier exclusion algorithm
@type: list
@param incremental_values: incremental values containing aggregate KPI data
@type: list
@param kpi_values: kpi value data [value, timestamp, outlier flag, lower bound, upper bound]
@return: a tuple of (updated KPI data values, outlier count, lower bound, upper bound)
"""
if method is None:
method = "stdev"
if multiplier is None:
multiplier = 2
kpi_values_copy = [value for value, _, _, _, _ in kpi_values]
lower_bound, upper_bound = None, None
if method == 'mad':
digest = TDigest()
for value in incremental_values:
if 'unfiltered_digest' in value:
digest += value['unfiltered_digest']
if digest.n > 0:
median = digest.percentile(50)
mad = statistics.median([abs(val - median) for val in kpi_values_copy])
upper_bound = median + (float(multiplier) * mad)
lower_bound = median - (float(multiplier) * mad)
elif method == 'iqr':
digest = TDigest()
for value in incremental_values:
if 'unfiltered_digest' in value:
digest += value['unfiltered_digest']
if digest.n > 0:
median = digest.percentile(50)
iqr = digest.percentile(75) - digest.percentile(25)
upper_bound = median + (float(multiplier) * iqr)
lower_bound = median - (float(multiplier) * iqr)
elif method == 'stdev':
total_count, total_sum, total_sum_of_squares = 0, 0, 0
for value in incremental_values:
if 'unfiltered_sum' in value:
total_sum += value['unfiltered_sum']
total_count += value['unfiltered_count']
total_sum_of_squares += value['unfiltered_sum_of_squares']
if total_count > 0:
mean = total_sum / total_count
variance = (total_sum_of_squares - 2 * mean * total_sum + total_count * mean ** 2) / total_count
stdev = math.sqrt(variance)
upper_bound = mean + (float(multiplier) * stdev)
lower_bound = mean - (float(multiplier) * stdev)
else:
raise Exception("Unsupported outlier detection method: %s" % method)
updated_values = []
outlier_count = 0
if lower_bound is not None and upper_bound is not None:
for value in kpi_values:
value = list(value)
is_outlier = lower_bound > value[0] or value[0] > upper_bound
value[2], value[3], value[4] = is_outlier, lower_bound, upper_bound
if is_outlier:
outlier_count += 1
updated_values.append(tuple(value))
return updated_values, outlier_count, lower_bound, upper_bound
def cycle_incremental_values(self, values_to_update, new_values, params, entity_config={}):
"""
Performs the cycling process of the incremental values, adding a new value for
the past 24 hours and removing any expired values outside of the training window
@type: list
@param values_to_update: list of incremental values to update
@type: list
@param new_values: kpi value data [value, timestamp, outlier flag, lower bound, upper bound]
@type: dict
@param params: applyat custom search command parameters
@type: object
@param entity_config: configuration object for entity level thresholding
"""
now = datetime.now()
try:
if params['dst_timezone']:
tz = pytz.timezone(params['dst_timezone'])
now = datetime.now(tz)
except UnknownTimeZoneError as e:
self.logger.exception(e)
self.logger.error('Found Unknown timezone')
updated_data_values = new_values
outlier_method = None
if (not entity_config and params['kpi']['detect_outliers']) or \
(entity_config and 'aggregate_outlier_detection_enabled' in entity_config and entity_config['aggregate_outlier_detection_enabled']):
outlier_method = params['kpi']['outlier_detection_algo'] if not entity_config else entity_config['outlier_detection_algo']
outlier_multiplier = params['kpi']['outlier_multiplier'] if not entity_config else entity_config['outlier_detection_sensitivity']
outliers, count, lower_bound, upper_bound = self.apply_outlier_algorithm_incremental(outlier_method, outlier_multiplier, values_to_update, new_values)
updated_data_values = outliers
outlier_dict = {
'kpi_id': params['kpi']['kpi_id'],
'service_id': self.at_run_params['service_id'],
'policy_key': self.key,
'training_window': self.at_run_params['training_window'],
'at_run_epoch': self.at_run_params['at_run_epoch'],
'use_temp_collection': self.at_run_params['use_temp_collection'],
'method': outlier_method,
'multiplier': outlier_multiplier,
'count': count,
'lower_bound': lower_bound,
'upper_bound': upper_bound
}
outlier_logger.info(outlier_dict)
self.logger.info("KPI: %s, %s outliers identified and removed" % (params['kpi']['kpi_id'], count))
training_window = self.at_run_params['training_window']
last_day = now - timedelta(days=1)
start_of_day = last_day.replace(hour=0, minute=0, second=0, microsecond=0).timestamp()
if start_of_day != values_to_update[-1]['timestamp']:
new_incremental_value = { 'timestamp': start_of_day }
for data in updated_data_values:
alert_value = data[0]
timestamp = data[1]
if outlier_method and timestamp >= start_of_day:
self.update_outlier_incremental_value(new_incremental_value, alert_value, outlier_method)
if data[2]:
continue
if timestamp >= start_of_day:
self.update_incremental_value(new_incremental_value, alert_value)
values_to_update.append(new_incremental_value)
training_window_diff = len(values_to_update) - AT_WINDOW_TO_DAYS_MAP[training_window]
if training_window_diff > 0:
del values_to_update[:training_window_diff]
def get_incremental_values(self, values, params, entity_config={}):
"""
Returns a list of incremental values split by days of the training window
with calculations based on the policy type
"""
now = datetime.now()
try:
if params['dst_timezone']:
tz = pytz.timezone(params['dst_timezone'])
now = datetime.now(tz)
except UnknownTimeZoneError as e:
self.logger.exception(e)
self.logger.error('Found Unknown timezone')
training_window = self.at_run_params['training_window']
incremental_values = []
for days_ago in range(AT_WINDOW_TO_DAYS_MAP[training_window]):
day = now - timedelta(days=days_ago + 1)
start_of_day = day.replace(hour=0, minute=0, second=0, microsecond=0).timestamp()
incremental_values.insert(0, { 'timestamp': start_of_day })
if len(values) < MIN_DATASET_LEN:
self.logger.error("There are less than %s data points to calculate thresholds in policy: %s, values: %s" % (MIN_DATASET_LEN, self.key, values))
return incremental_values
cur_index = 0
next_timestamp = incremental_values[cur_index + 1]['timestamp']
outlier_method = None
updated_values = values
if not entity_config and params['kpi']['detect_outliers']:
alert_value_data = {'alert_values': values}
outlier_method = params['kpi']['outlier_detection_algo']
outlier_multiplier = params['kpi']['outlier_multiplier']
apply_outlier_algorithm(alert_value_data, outlier_method, outlier_multiplier)
updated_values = alert_value_data['alert_values']
elif entity_config and 'aggregate_outlier_detection_enabled' in entity_config and entity_config['aggregate_outlier_detection_enabled']:
alert_value_data = {'alert_values': values}
outlier_method = entity_config['outlier_detection_algo']
outlier_multiplier = entity_config['outlier_detection_sensitivity']
apply_outlier_algorithm(alert_value_data, outlier_method, outlier_multiplier)
updated_values = alert_value_data['alert_values']
for data in updated_values:
alert_value = data[0]
timestamp = data[1]
if timestamp >= next_timestamp:
cur_index += 1
if cur_index + 1 >= AT_WINDOW_TO_DAYS_MAP[training_window]:
next_timestamp = now.timestamp()
else:
next_timestamp = incremental_values[cur_index + 1]['timestamp']
if outlier_method and timestamp >= incremental_values[cur_index]['timestamp']:
self.update_outlier_incremental_value(incremental_values[cur_index], alert_value, outlier_method)
if data[2]:
continue
if timestamp >= incremental_values[cur_index]['timestamp']:
self.update_incremental_value(incremental_values[cur_index], alert_value)
return incremental_values
# returns a copy of threshold levels structure with thresholdValue field updated
def get_thresholds(self, values, kpi_dict, use_incremental_method=False, entity_config=False):
if self.method is None:
raise UnboundLocalError("No method set for Policy.")
if not use_incremental_method:
data = {'alert_values': values}
if len(values) < MIN_DATASET_LEN:
self.logger.error("There are less than %s data points to calculate thresholds in policy: %s, values: %s" % (MIN_DATASET_LEN, self.key, values))
return None
if (not entity_config and kpi_dict['detect_outliers']) or \
(entity_config and 'aggregate_outlier_detection_enabled' in entity_config and entity_config['aggregate_outlier_detection_enabled']):
outlier_method = kpi_dict['outlier_detection_algo'] if not entity_config else entity_config['outlier_detection_algo']
outlier_multiplier = kpi_dict['outlier_multiplier'] if not entity_config else entity_config['outlier_detection_sensitivity']
outliers, lower_bound, upper_bound = remove_outliers(data, outlier_method, outlier_multiplier)
outlier_dict = {
'kpi_id': kpi_dict['kpi_id'],
'service_id': self.at_run_params['service_id'],
'policy_key': self.key,
'training_window': self.at_run_params['training_window'],
'at_run_epoch': self.at_run_params['at_run_epoch'],
'use_temp_collection': self.at_run_params['use_temp_collection'],
'method': outlier_method,
'multiplier': outlier_multiplier,
'count': len(outliers),
'lower_bound': lower_bound,
'upper_bound': upper_bound
}
# Write outliers metadata to outlier log.
outlier_logger.info(outlier_dict)
self.logger.info("KPI: %s, %s outliers identified and removed: %s" % (kpi_dict['kpi_id'], len(outliers), outliers))
filtered_values = data['alert_values']
D = {'alert_values': [v[0] for v in filtered_values if not math.isnan(v[0])]}
if len(D['alert_values']) < MIN_DATASET_LEN:
self.logger.error("There are less than %s data points in policy: %s, %s" % (MIN_DATASET_LEN, self.key, values))
return None
if self.method == 'stdev': # pretty standard, really
if use_incremental_method:
total_count = 0
total_sum = 0
total_sum_squares = 0
for value in values:
if 'sum' in value:
total_count += value['count']
total_sum += value['sum']
total_sum_squares += value['sum_of_squares']
if total_count < MIN_DATASET_LEN:
self.logger.error("There are less than %s data points for stdev to calculate thresholds in policy: %s" % (MIN_DATASET_LEN, self.key))
return None
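# One-pass variance from the accumulated sums: since
# sum((x - mean)^2) = sum(x^2) - 2 * mean * sum(x) + n * mean^2,
# dividing by n gives the population variance without revisiting raw values.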
mean = total_sum / total_count
variance = (total_sum_squares - 2 * mean * total_sum + total_count * (mean ** 2)) / total_count
std = math.sqrt(variance)
else:
# Simple two-pass algorithm for calculating stdev. Reasonably numerically stable.
mean = sum(D['alert_values']) / len(D['alert_values'])
sqe = sum((x - mean) ** 2. for x in D['alert_values'])
std = math.sqrt(sqe / (len(D['alert_values']) - 1))
if std == 0.0:
# Very rare scenario when all the alert values are the same,
# setting it to a non-zero value based on a heuristic.
self.logger.info("STD evaluated as 0, setting it to a non-zero value.")
std = mean * 0.001 + 0.001 # one-thousandth of the mean, plus a small constant in case mean is 0
return self.get_updated_levels([mean + (std * c) for c in self.parameter_values], kpi_dict['kpi_id'], self.at_run_params['service_id'])
# formerly iqr and same as "mass" in prior iterations
elif self.method == 'quantile':
if use_incremental_method:
digest = TDigest()
for value in values:
if 'digest' in value:
digest += value['digest']
if digest.n < MIN_DATASET_LEN:
self.logger.error("There are less than %s data points for range to calculate thresholds in policy: %s" % (MIN_DATASET_LEN, self.key))
return None
T = {x: digest.percentile(x * 100) for x in self.parameter_values}
else:
T = quantiles(D['alert_values'], self.parameter_values)
return self.get_updated_levels([T[k] for k in self.parameter_values], kpi_dict['kpi_id'], self.at_run_params['service_id'])
elif self.method == 'range': # equal width bands
if use_incremental_method:
dmax = -math.inf
dmin = math.inf
total_count = 0
for value in values:
if 'min' in value:
dmax = max(dmax, value['max'])
dmin = min(dmin, value['min'])
total_count += value['count']
if total_count < MIN_DATASET_LEN:
self.logger.error("There are less than %s data points for range to calculate thresholds in policy: %s" % (MIN_DATASET_LEN, self.key))
return None
else:
dmax = max(D['alert_values'])
dmin = min(D['alert_values'])
span = dmax - dmin
return self.get_updated_levels([dmin + (span * c) for c in self.parameter_values], kpi_dict['kpi_id'], self.at_run_params['service_id'])
elif self.method == 'percentage':
if use_incremental_method:
total_sum = 0
total_count = 0
for value in values:
if 'sum' in value:
total_sum += value['sum']
total_count += value['count']
if total_count < MIN_DATASET_LEN:
self.logger.error("There are less than %s data points for percentage to calculate thresholds in policy: %s" % (MIN_DATASET_LEN, self.key))
return None
mean = total_sum / total_count
else:
# Simple Percentage as a baseline algorithm, calculate mean and use it as a base of percentage
mean = sum(D['alert_values']) / len(D['alert_values'])
return self.get_updated_levels([mean * (1 + c / 100) for c in self.parameter_values], kpi_dict['kpi_id'], self.at_run_params['service_id'])
else:
raise ValueError("Invalid thresholding method: " + self.method)
# Schedule Class
class Schedule(object):
# policies: dict of Policy Objects keyed by policy.key
# schedule: dict of policy_keys keyed by block_keys
def __init__(self, kpi_object, policies, threshold_spec, params):
# validate kpi
if kpi_object is None:
raise ValueError("Null KPI object sent to Schedule constructor.")
if not isinstance(kpi_object, KPIBase):
raise ValueError("KPI parameter must be a kpi.KPI object")
# validate policies
if policies is None:
raise ValueError("Null policy dict sent to Schedule constructor.")
if not isinstance(policies, dict):
raise ValueError(
"Policies parameter must be a dict, got %s." % type(policies))
if len(policies) > 168 or len(policies) == 0:
raise ValueError(
"Policies parameter must be a dict of no more than 168 Policy objects, got %s." % len(policies))
if not all(isinstance(p, Policy) for p in policies.values()):
raise ValueError("All policies must be Policy objects.")
self.logger = None
if 'logger' in params:
self.logger = params['logger']
self.kpi_object = kpi_object
self.policies = policies
self.filter = PolicyFilter(threshold_spec)
self.incremental_values = None
def _parse_digest_data(self, incremental_value, field):
"""
Parses the serialized tdigest data stored in the incremental value and
creates a tdigest object from it
@type: dict
@param incremental_value: incremental value with serialized tdigest data
@type: string
@param field: field containing the tdigest data
@rtype: object
@return: parsed tdigest object
"""
digest = TDigest()
digest.K = incremental_value[field].get('K')
digest.delta = incremental_value[field].get('delta')
[digest.update(value['m'], value['c']) for value in incremental_value[field].get('centroids')]
return digest
def _get_thresholds(self, data, params, entity_config={}):
if data is None:
raise ValueError("Null data sent to Schedule.")
if not isinstance(data, dict) or 'alert_value' not in data:
raise ValueError(
"Data passed to Schedule must be a dict with values in column 'alert_values'." + str(data))
data_key = params['kpi']['kpi_id'] if not entity_config else entity_config['_key']
if params['use_incremental_method']:
at_incremental_values_obj = ItsiAtIncrementalValues(params["session_key"], 'nobody')
kpi_at_incremental_values = at_incremental_values_obj.get_kpi_at_incremental_values('nobody', data_key)
for policy in kpi_at_incremental_values['policies']:
for value in kpi_at_incremental_values['policies'][policy]['incremental_values']:
if 'digest' in value:
value['digest'] = self._parse_digest_data(value, 'digest')
if 'unfiltered_digest' in value:
value['unfiltered_digest'] = self._parse_digest_data(value, 'unfiltered_digest')
elif params['incremental_learning_enabled']:
kpi_at_incremental_values = {
'_key': data_key,
'policies': {}
}
# divide data based on policy: D[policy_key] = [tuples]
D = {}
for policy_key in self.policies:
D[policy_key] = []
index_converted = data['_time']
active_policies = set()
for data_index in range(len(index_converted)):
# If apply_dst_to_at is enabled, shift the timestamp by the DST offset
if not params["disable_dst_to_at"] and params["dst_change_timestamp"] > 0 and params["dst_offset"] != 0:
# If _time is earlier than the last dst_change_timestamp, add the dst_offset to it
if index_converted[data_index] < params["dst_change_timestamp"]:
index_converted[data_index] = index_converted[data_index] + params["dst_offset"]
# provide a timestamp and TZ, get the policy that includes this timestamp
policy_key = self.filter.get_policy_key(time=index_converted[data_index])
if policy_key in D:
D[policy_key].append((data['alert_value'][data_index], index_converted[data_index], False, 0, 0))
active_policies.add(policy_key)
# compute and accumulate the thresholds for each Policy
T = {}
insufficient_data_policies = []
should_create_incremental_values = True
for policy_key in self.policies:
the_data = D[policy_key]
if params['use_incremental_method']:
self.policies[policy_key].cycle_incremental_values(kpi_at_incremental_values['policies'][policy_key]['incremental_values'], the_data, params, entity_config)
T[policy_key] = self.policies[policy_key].get_thresholds(kpi_at_incremental_values['policies'][policy_key]['incremental_values'], params['kpi'], True, entity_config)
self.incremental_values = kpi_at_incremental_values
elif not entity_config:
policy_type = params['kpi']['settings']['policies'][policy_key]['policy_type']
time_blocks = params['kpi']['settings']['policies'][policy_key]['time_blocks']
if params['incremental_learning_enabled'] and should_create_incremental_values and policy_type in ('stdev', 'range', 'percentage', 'quantile'):
dynamic_params = [{'severityValue': tl['severityValue'], 'dynamicParam': tl['dynamicParam']} for tl in params['kpi']['settings']['policies'][policy_key]['aggregate_thresholds']['thresholdLevels']]
kpi_at_incremental_values['policies'][policy_key] = { 'policy_type': policy_type, 'time_blocks': time_blocks, 'dynamic_params': dynamic_params }
kpi_at_incremental_values['policies'][policy_key]['incremental_values'] = self.policies[policy_key].get_incremental_values(the_data, params)
kpi_at_incremental_values["aggregate_outlier_detection_enabled"] = params['kpi']['detect_outliers']
kpi_at_incremental_values["outlier_detection_algo"] = params['kpi']['outlier_detection_algo']
kpi_at_incremental_values["outlier_detection_sensitivity"] = params['kpi']['outlier_multiplier']
kpi_at_incremental_values["adaptive_thresholding_training_window"] = params['kpi']['adaptive_thresholding_training_window']
self.incremental_values = kpi_at_incremental_values
T[policy_key] = self.policies[policy_key].get_thresholds(the_data, params['kpi'])
else:
policy_type = entity_config['time_variate_thresholds_specification']['policies'][policy_key]['policy_type']
time_blocks = entity_config['time_variate_thresholds_specification']['policies'][policy_key]['time_blocks']
if params['incremental_learning_enabled'] and should_create_incremental_values and policy_type in ('stdev', 'range', 'percentage', 'quantile'):
dynamic_params = [{'severityValue': tl['severityValue'], 'dynamicParam': tl['dynamicParam']} for tl in entity_config['time_variate_thresholds_specification']['policies'][policy_key]['entity_thresholds']['thresholdLevels']]
kpi_at_incremental_values['policies'][policy_key] = { 'policy_type': policy_type, 'time_blocks': time_blocks, 'dynamic_params': dynamic_params }
kpi_at_incremental_values['policies'][policy_key]['incremental_values'] = self.policies[policy_key].get_incremental_values(the_data, params, entity_config)
kpi_at_incremental_values["adaptive_thresholding_training_window"] = entity_config['adaptive_thresholding_training_window']
if 'aggregate_outlier_detection_enabled' in entity_config:
kpi_at_incremental_values["aggregate_outlier_detection_enabled"] = entity_config['aggregate_outlier_detection_enabled']
kpi_at_incremental_values["outlier_detection_algo"] = entity_config['outlier_detection_algo']
kpi_at_incremental_values["outlier_detection_sensitivity"] = entity_config['outlier_detection_sensitivity']
self.incremental_values = kpi_at_incremental_values
T[policy_key] = self.policies[policy_key].get_thresholds(the_data, params['kpi'], False, entity_config)
if T[policy_key] is None and policy_key in active_policies:
insufficient_data_policies.append(self.policies[policy_key].title)
self.logger.info(
"Insufficient data for threshold calculation: %d values." % len(D[policy_key]))
if len(insufficient_data_policies) > 0:
add_message(params['out_metadata'], 'WARN',
'insufficient data in ITSI summary index for policies %s' % str(insufficient_data_policies))
return T
def get_thresholds(self, data, params, entity_config={}):
"""Computes thresholds for a KPI and this schedule.
:param data: dict with 'alert_value': list of floats
'_time': list of float epoch timestamps
:param params: dict with kpi settings
:param entity_config: entity level configuration object
Returns a dict of lists of threshold level structures, keyed by policy.key;
the structures should have a populated `thresholdValue` field obtained from the result of the computation
"""
metadata = params['out_metadata']
thresholds = {}
kpi_info = 'kpiid="%s" on serviceid="%s"' % (str(params['kpi']['kpi_id']), str(params['kpi']['service_id']))
try:
thresholds = self._get_thresholds(data=data, params=params, entity_config=entity_config)
except ValueError:
log_and_warn(metadata=metadata, logger=self.logger,
msg='Unconvertible alert_values found for ' + kpi_info,
search_msg="unconvertible values found (check this KPI's `alert_value` "
"field in ITSI summary index")
except AssertionError:
# Method should probably raise a ValueError/try to convert 0-100 to 0.0-1.0, but for now log nicely
log_and_warn(metadata=metadata, logger=self.logger,
msg='Invalid quantile specified for %s, must be between 0.0 and 1.0' % kpi_info,
search_msg='invalid quantile value, must be between 0.0 and 1.0')
except Exception as e:
log_and_warn(metadata=metadata, logger=self.logger, msg=str(e))
log_and_warn(metadata=metadata, logger=self.logger,
msg='Unexpected exception when computing thresholds for %s' % kpi_info)
return thresholds
def create_schedule(params, entity_config=None):
policies = {}
metadata = params['out_metadata']
settings = entity_config['time_variate_thresholds_specification'] if entity_config else params['kpi']['settings']
logger = params['logger']
# get policy settings for this KPI, create Policy objects
for policy_key in settings['policies']:
t_method = str(settings['policies'][policy_key]['policy_type'])
t_title = str(settings['policies'][policy_key].get('title', policy_key))
try:
t_levels = settings['policies'][policy_key][params['threshold_key']]['thresholdLevels']
except KeyError as e:
# we just skip this policy
logger.exception(e)
log_and_warn(metadata=metadata, logger=logger, msg="Failed to retrieve %ss: %s" % (params['threshold_key'], e))
continue
policy_key = str(policy_key)
if t_method == 'static':
logger.info("Skipping static policy '%s'", policy_key)
elif not isinstance(t_levels, list) or not t_levels:
log_and_warn(metadata=metadata, logger=logger,
msg="Unable to apply adaptive thresholding on policy '%s': please specify threshold values "
"for the policy" % t_title)
continue
else:
skip_policy = False
for x in t_levels:
if 'dynamicParam' not in x:
log_and_warn(metadata=metadata, logger=logger,
msg="Unable to apply adaptive thresholding on policy '%s': Missing threshold "
"value." % t_title)
skip_policy = True
break
try:
float(x['dynamicParam'])
except (TypeError, ValueError):
log_and_warn(metadata=metadata, logger=logger,
msg="Unable to apply adaptive thresholding on policy '%s': Invalid threshold "
"value: %s" % (t_title, x['dynamicParam']))
skip_policy = True
break
if skip_policy:
continue
logger.debug("Loading settings for policy %s: method=%s levels=%s" % (
policy_key, t_method, t_levels))
try:
at_run_params = {'at_run_epoch': params['at_run_epoch'],
'use_temp_collection': params['use_temp_collection'],
'service_id': params['kpi']['service_id'],
'training_window': params['kpi']['adaptive_thresholding_training_window']}
policies[policy_key] = Policy(
key=policy_key, method=t_method, parameters=t_levels, title=t_title, logger=logger, at_run_params=at_run_params)
except ValueError as e:
logger.exception(e)
log_and_warn(metadata=metadata, logger=logger, msg="Invalid arguments sent to Policy.")
the_schedule = None
if len(policies) == 0:
return
try:
the_schedule = Schedule(
kpi_object=params['kpi']['kpi_object'], policies=policies, threshold_spec=settings, params=params)
except ValueError as e:
logger.exception(e)
log_and_warn(metadata=metadata, logger=logger, msg="Invalid arguments sent to Schedule.")
return the_schedule
def output_results(at_command, params, thresholds, data, entity_config=None):
"""
thresholds: dict of lists of threshold level structures, keyed by policy id
"""
settings = entity_config['time_variate_thresholds_specification'] if entity_config else params['kpi']['settings']
service_id = params['kpi']['service_id']
kpi_id = params['kpi']['kpi_id']
if not thresholds and not at_command:
alerts_converted = data["alert_value"]
time_converted = data["_time"]
filter = PolicyFilter(settings)
for index in range(len(time_converted)):
try:
alert_val = alerts_converted[index]
time_val = time_converted[index]
policy_key = filter.get_policy_key(time_val)
line = {
'policy_key': policy_key, 'itsi_service_id': service_id, 'itsi_kpi_id': kpi_id,
'alert_value': alert_val, '_time': time_val
}
if entity_config:
line.update({'entity_key': entity_config['entity_key'], 'entity_title': entity_config['entity_title']})
except IndexError:
raise Exception(data)
params['writer'].writerow(line)
else:
for policy_id in thresholds:
t = thresholds[policy_id]
if t is not None:
if params['use_kv_store']:
if len(t) != len(settings['policies'][policy_id][params['threshold_key']]['thresholdLevels']):
kpistr = ""
if service_id is not None and kpi_id is not None and service_id != "" and kpi_id != "":
kpistr = " for kpi %s" % str(service_id) + ":" + str(kpi_id)
found = len(settings['policies'][policy_id][params['threshold_key']]['thresholdLevels'])
msg = "Mismatched number of thresholdLevels: %s. Generated %d but found %d." % (
kpistr, len(t), found)
log_and_warn(metadata=params['out_metadata'], logger=params['logger'], msg=msg)
else:
# n.b. we assume thresholdLevels objects are
# sorted by increasing thresholdValue
# move this update_thresholds to outside
if entity_config:
params['entity_threshold_object'].update_thresholds(
policy=policy_id, thresholds=t, entity=entity_config)
else:
params['kpi']['kpi_object'].update_thresholds(
policy=policy_id, thresholds=t)
line = {
'policy_id': policy_id, 'itsi_service_id': service_id, 'itsi_kpi_id': kpi_id}
if entity_config:
line.update({'entity_key': entity_config['entity_key'], 'entity_title': entity_config['entity_title']})
for thresh_index in range(len(t)):
line['threshold_' + str(thresh_index)] = t[thresh_index].get('thresholdValue')
line['threshold_metadata_' + str(thresh_index)] = t[thresh_index]
params['writer'].writerow(line)
if params['kpi']['adaptive_thresholding_copy_kpi_to_entity'] and not entity_config:
params['kpi']['kpi_object'].copy_kpi_thresholds_to_base_entity_thresholds()
return
def ignore_invalid_row(warn_message, logger):
"""
Method to log warning and ignore read row result
Assumes read_chunk was invoked before this method is invoked
@type: string
@param warn_message: warning message to log
@rtype: None
@return: None
"""
logger.warn(warn_message)
# Dummy response to ignore
write_chunk(sys.stdout, {"finished": False}, '')
def gather_input_data(params, logger, fields_list):
kpidict = dict() # kpidict['itsi_service_id']['itsi_kpi_id']
while True:
params['out_metadata']['finished'] = False
ret = read_chunk(sys.stdin, logger)
if not ret:
break
metadata, body = ret
parse_input_data(
the_dict=kpidict, data=body, fields_list=fields_list, params=params)
write_chunk(sys.stdout, params['out_metadata'], '')
if metadata.get('finished', False):
break
params['kpidict'] = kpidict
params['outbuf'] = StringIO()
def last_dst_change_timestamp(logger, dst_timezone):
"""
Returns the last dst change timestamp for the provided timezone
@type logger: logger object
@param logger: logger object
@type: string
@param dst_timezone: timezone provided by the user in itsi_settings.conf
"""
try:
# Creates a time zone object based on the provided timezone name
tz = None
try:
tz = pytz.timezone(dst_timezone)
except UnknownTimeZoneError as e:
logger.exception(e)
logger.error('Found unknown timezone: %s' % dst_timezone)
if tz:
# Gets the current time in the specified time zone.
now = datetime.now(tz)
# Gets a list of UTC transition times for the time zone.
# These are the times at which the offset from UTC changes due to daylight
# saving time changes or other reasons.
# For example [
# datetime.datetime(2023, 3, 12, 10, 0),
# datetime.datetime(2023, 11, 5, 9, 0),
# datetime.datetime(2024, 3, 10, 10, 0),
# datetime.datetime(2024, 11, 3, 9, 0),
# datetime.datetime(2025, 3, 9, 10, 0),
# datetime.datetime(2025, 11, 2, 9, 0)
# ]
transitions = list(tz._utc_transition_times)
# Converts each UTC transition time to a datetime object in UTC time zone
# [
# datetime.datetime(2023, 3, 12, 10, 0, tzinfo=<UTC>),
# datetime.datetime(2023, 11, 5, 9, 0, tzinfo=<UTC>),
# datetime.datetime(2024, 3, 10, 10, 0, tzinfo=<UTC>),
# datetime.datetime(2024, 11, 3, 9, 0, tzinfo=<UTC>),
# datetime.datetime(2025, 3, 9, 10, 0, tzinfo=<UTC>),
# datetime.datetime(2025, 11, 2, 9, 0, tzinfo=<UTC>)
# ]
transitions = [pytz.utc.localize(transition) for transition in transitions]
# Sorts the list of transition times in ascending order.
transitions.sort()
last_dst_change = None
for transition_time in transitions:
# If the transition time is before the current time,
# it updates the last_dst_change variable to this transition time.
if transition_time < now:
last_dst_change = transition_time
else:
break
if last_dst_change:
# Converts the time object to a timestamp e.g. 1710064799
return round(last_dst_change.timestamp(), 2)
else:
logger.error('Failed to fetch last dst change timestamp')
return None
else:
return None
except Exception as e:
logger.exception(e)
logger.error('Failed to fetch last dst change timestamp')
return None
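# Illustrative sketch (hypothetical timezone and clock): for 'America/Los_Angeles'
# shortly after the March 2024 change, the most recent pytz UTC transition is
# datetime(2024, 3, 10, 10, 0), so the function returns its epoch, about 1710064800.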
def get_at_dst_changes_details(logger, session_key):
"""
Fetches the dst_changes details from apply_dst_to_at stanza from itsi_settings.conf
and returns the required data to apply dst changes
@type logger: logger object
@param logger: logger object
@type: string
@param session_key: the splunkd session key for the request
"""
# Fetch data from apply_dst_to_at stanza of itsi_settings.conf
cfm = ConfManager(session_key, 'SA-ITOA')
conf = cfm.get_conf('itsi_settings')
disable_dst_to_at = 1
dst_timezone = ''
dst_offset = 0
try:
apply_dst_to_at = conf.get('apply_dst_to_at')
disable_dst_to_at = int(apply_dst_to_at.get('disabled', 1))
dst_timezone = apply_dst_to_at.get('timezone', '')
dst_offset = int(apply_dst_to_at.get('offset', 0))
except Exception as e:
logger.exception(e)
logger.error('Failed to fetch dst settings for the threshold calculation')
days_since_dst = 0
dst_change_timestamp = 0
if not disable_dst_to_at:
if dst_timezone and dst_offset != 0:
# Fetch the last dst change timestamp as per the provided timezone
dst_change_timestamp = last_dst_change_timestamp(logger, dst_timezone)
if dst_change_timestamp:
# Calculate the number of days past dst based on timestamp
last_dst_change_datetime = datetime.utcfromtimestamp(dst_change_timestamp)
current_datetime = datetime.utcnow()
days_since_dst = (current_datetime - last_dst_change_datetime).days
# Disable apply_dst_to_at in itsi_settings.conf if more than 60 days have passed since the DST change
if days_since_dst > 60:
try:
conf.update('apply_dst_to_at', {'disabled': 1})
disable_dst_to_at = 1
except Exception as e:
logger.exception(e)
logger.error('Failed to update dst disabled settings for the threshold calculation')
else:
try:
conf.update('apply_dst_to_at', {'disabled': 1})
disable_dst_to_at = 1
except Exception as e:
logger.exception(e)
logger.error('Failed to update dst disabled settings for the threshold calculation')
logger.error('Could not find a timestamp for the provided timezone. Hence DST changes to AT will not be applied')
else:
try:
conf.update('apply_dst_to_at', {'disabled': 1})
disable_dst_to_at = 1
except Exception as e:
logger.exception(e)
logger.error('Failed to update dst disabled settings for the threshold calculation')
logger.error('Could not find a timezone or an offset to apply DST changes to AT. Hence DST changes to AT will not be applied')
return disable_dst_to_at, dst_timezone, dst_change_timestamp, dst_offset
def serialize_digest_data(incremental_value, field):
"""
Serializes tdigest objects to be stored in the incremental values collection
@type: dict
@param incremental_value: incremental value with tdigest data
@type: string
@param field: field containing the tdigest data
@rtype: dict
@return: serialized tdigest data
"""
digest = incremental_value[field]
centroids = []
for key in digest.C.keys():
tree_values = digest.C.get_value(key)
centroids.append({'m': tree_values.mean, 'c': tree_values.count})
return {'n': digest.n, 'delta': digest.delta, 'K': digest.K, 'centroids': centroids}
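# Illustrative round trip (a sketch; values assume the tdigest package defaults
# of delta=0.01 and K=25): the dict produced here is what
# Schedule._parse_digest_data() rebuilds a TDigest from, e.g.
#   d = TDigest(); d.update(1.0); d.update(2.0)
#   serialize_digest_data({'digest': d}, 'digest')
#   -> {'n': 2, 'delta': 0.01, 'K': 25, 'centroids': [{'m': 1.0, 'c': 1}, {'m': 2.0, 'c': 1}]}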
def chunker(params, at_command=False):
logger = params['logger']
kpidict = params['kpidict']
params['outbuf'] = StringIO()
at_incremental_values = []
if at_command:
fields_list = ['policy_id']
for k in range(10):
fields_list.append("threshold_" + str(k))
fields_list.append("threshold_metadata_" + str(k))
fields_list = fields_list + ['itsi_service_id', 'itsi_kpi_id']
if params['entity_level_thresholds']:
fields_list = fields_list + ['entity_key', 'entity_title']
else:
fields_list = ['policy_key', 'itsi_service_id', 'itsi_kpi_id', 'alert_value', '_time']
# prepare for generating output
params['out_metadata']['finished'] = False
# Create a dict writer with IO
params['writer'] = csv.DictWriter(params['outbuf'], fieldnames=fields_list, dialect='excel', extrasaction='ignore')
params['writer'].writeheader()
# Get the service object
params['service_object'] = get_service_object(params)
params['disable_dst_to_at'], params['dst_timezone'], params['dst_change_timestamp'], params['dst_offset'] = \
get_at_dst_changes_details(logger, session_key=params["session_key"])
# Bulk fetch the services of targeted kpis
if params['service_object']:
params['service_object'].bulk_fetch_service(kpidict.keys())
if params['entity_level_thresholds']:
params['entity_threshold_object'] = EntityThreshold(logger=params['logger'])
params['entity_threshold_object'].initialize_interface(
params['session_key'], owner='nobody')
if at_command:
list_kpis = []
for itsi_service_id in kpidict:
for itsi_kpi_id in kpidict[itsi_service_id]:
list_kpis.append(itsi_kpi_id)
# Get the Active Custom Threshold Windows which are of type percentage
ctw_object = CustomThresholdWindow(logger=logger)
ctw_object.initialize_interface(
params['session_key'], owner='nobody')
ctw_linked_kpis = ctw_object.bulk_fetch_active_ctw(list_kpis)
# Phase 2: iterate over (serviceid, kpiid) and output scores
for itsi_service_id in kpidict:
params['kpi'] = {
'service_id': itsi_service_id,
'service_data': None
}
if params['service_object']:
# save the service data
params['kpi']['service_data'] = params['service_object'].fetch_service(itsi_service_id)
for itsi_kpi_id in kpidict[itsi_service_id]:
params['kpi']['kpi_id'] = itsi_kpi_id
# get the KPI object
params['kpi']['kpi_object'] = get_kpi_object(params)
if params['kpi']['kpi_object'] is None:
ignore_invalid_row('No KPI found with id %s, ignoring ...' % itsi_kpi_id, logger)
continue
# get the settings
kpi_tmp = params['kpi']['kpi_object'].get_kpi()
if not isinstance(kpi_tmp, dict):
ignore_invalid_row('No valid KPI found with id %s, ignoring ...' % itsi_kpi_id, logger)
continue
if 'time_variate_thresholds_specification' not in kpi_tmp:
ignore_invalid_row(
'No valid thresholds specification found for KPI with id %s, ignoring ...' % itsi_kpi_id,
logger
)
continue
params['kpi']['entity_thresholds'] = {}
if params['entity_level_thresholds']:
list_entity_keys = kpidict[itsi_service_id][itsi_kpi_id].keys()
entity_threshold_configs = params['entity_threshold_object'].bulk_fetch_configs(itsi_kpi_id, list_entity_keys, params['pseudo_entities'])
if not entity_threshold_configs:
break
for entity_config in entity_threshold_configs:
# Create temp Entity Key to store persistent entity config in a global object
entity_key = entity_config.get("entity_key") if entity_config.get("entity_key") != 'N/A' else hashlib.md5(( entity_config['entity_title'] + entity_config['kpi_id']).encode("utf-8")).hexdigest()
params['kpi']['entity_thresholds'].update({entity_key: entity_config})
params['kpi']['settings'] = kpi_tmp[
'time_variate_thresholds_specification']
params['kpi']['detect_outliers'] = False
params['kpi']['outlier_detection_algo'] = None
params['kpi']['outlier_multiplier'] = None
params['kpi']['adaptive_thresholding_training_window'] = kpi_tmp['adaptive_thresholding_training_window']
params['kpi']['adaptive_thresholding_copy_kpi_to_entity'] = (
kpi_tmp['adaptive_thresholding_copy_kpi_to_entity']
if 'adaptive_thresholding_copy_kpi_to_entity' in kpi_tmp
else False
)
if 'aggregate_outlier_detection_enabled' in kpi_tmp:
params['kpi']['detect_outliers'] = False if params['entity_level_thresholds'] else kpi_tmp['aggregate_outlier_detection_enabled']
if kpi_tmp['aggregate_outlier_detection_enabled']:
if 'outlier_detection_algo' in kpi_tmp:
params['kpi']['outlier_detection_algo'] = kpi_tmp['outlier_detection_algo']
if 'outlier_detection_sensitivity' in kpi_tmp:
params['kpi']['outlier_multiplier'] = kpi_tmp['outlier_detection_sensitivity']
if at_command:
if kpi_tmp['_key'] in ctw_linked_kpis and kpi_tmp['adaptive_thresholds_is_enabled']:
kpi_tmp['recalculate_custom_thresholds'] = True
if params['kpi']['settings'] is not None:
if params['entity_level_thresholds']:
for entity_key in kpidict[itsi_service_id][itsi_kpi_id]:
# Ignore Entity from data if we don't have configuration available as kpi_entity_threshold
if not params['kpi']['entity_thresholds'].get(entity_key):
continue
schedule = calculate_thresholds(at_command, params=params, data=kpidict[itsi_service_id][itsi_kpi_id][entity_key], entity_config=params['kpi']['entity_thresholds'][entity_key])
if schedule and schedule.incremental_values:
at_incremental_values.append(schedule.incremental_values)
else:
schedule = calculate_thresholds(at_command, params=params, data=kpidict[itsi_service_id][itsi_kpi_id], entity_config=None)
if schedule and schedule.incremental_values:
at_incremental_values.append(schedule.incremental_values)
else:
ignore_invalid_row(
'No valid thresholds specification found for KPI with id %s, ignoring ...' % itsi_kpi_id,
logger
)
continue
if len(at_incremental_values):
at_incremental_values_obj = ItsiAtIncrementalValues(params["session_key"], 'nobody')
at_incremental_values_ = copy.deepcopy(at_incremental_values)
for kpi_ in at_incremental_values_:
for policy in kpi_["policies"]:
if 'incremental_values' in kpi_["policies"][policy]:
for value in kpi_["policies"][policy]["incremental_values"]:
if 'digest' in value:
value["digest"] = serialize_digest_data(value, 'digest')
if 'unfiltered_digest' in value:
value["unfiltered_digest"] = serialize_digest_data(value, 'unfiltered_digest')
at_incremental_values_obj.save_batch('nobody', at_incremental_values_, validate_names=False)
# Write output data in buffer
write_chunk(sys.stdout, params['out_metadata'], params['outbuf'].getvalue())
def calculate_thresholds(at_command, params, data, entity_config=None):
"""
Calculates thresholds from the data dict
@type: bool
@param at_command: if True, compute and write new thresholds; otherwise pass data through
@type: dict
@param params: applyat custom search command parameters
@type: dict
@param data: Data points dictionary to parse to get new threshold values
@type: dict
@param entity_config: Entity Level Configuration object. Only for Entity level thresholding
"""
the_schedule = None
if at_command:
# create the schedule
the_schedule = create_schedule(params=params, entity_config=entity_config)
values = clean_values(
data=data,
params=params
)
thresholds = None
if at_command:
# compute the thresholds
if the_schedule is not None:
thresholds = the_schedule.get_thresholds(data=values, params=params, entity_config=entity_config)
else:
thresholds = {}
values['policy_key'] = []
# write output to buffer
output_results(at_command=at_command, params=params, thresholds=thresholds, data=values, entity_config=entity_config)
return the_schedule
def remove_outliers(data, method, multiplier):
if method is None:
method = "stdev"
if multiplier is None:
multiplier = 2
return apply_outlier_algorithm(data, method, multiplier, remove=True)
def apply_outlier_algorithm(data, method, multiplier, remove=False):
if data is None or 'alert_values' not in data:
raise Exception("Data is empty or not in correct format for applying outlier algorithm")
list_of_tuples = data['alert_values']
arr_floats = [float(i[0]) for i in list_of_tuples]
median = statistics.median(arr_floats)
# Calculate bounds
if method.lower() == 'mad':
mad = statistics.median([abs(val - median) for val in arr_floats])
upper_bound = median + (float(multiplier) * mad)
lower_bound = median - (float(multiplier) * mad)
elif method.lower() == 'iqr':
arr_floats = sorted(arr_floats)
iqr = quantile(arr_floats, 0.75) - quantile(arr_floats, 0.25)
upper_bound = median + (float(multiplier) * iqr)
lower_bound = median - (float(multiplier) * iqr)
elif method.lower() == 'stdev':
mean = statistics.mean(arr_floats)
stdev = statistics.stdev(arr_floats)
upper_bound = mean + (float(multiplier) * stdev)
lower_bound = mean - (float(multiplier) * stdev)
else:
raise Exception("Unsupported outlier detection method: %s" % method)
updated_values = []
outliers = []
# Iterate over data and mark outliers
for x in list_of_tuples:
x_list = list(x)
try:
x_list[3] = lower_bound
x_list[4] = upper_bound
except IndexError:
raise Exception(x_list)
if float(x[0]) > upper_bound or float(x[0]) < lower_bound:
x_list[2] = True
outliers.append(tuple(x_list))
if remove:
continue
updated_values.append(tuple(x_list))
data['alert_values'] = updated_values
return outliers, lower_bound, upper_bound
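# Worked example (a sketch, using hypothetical 5-tuples in the
# (value, time, outlier flag, lower, upper) layout used elsewhere in this module):
#   data = {'alert_values': [(1.0, 0, False, 0, 0), (2.0, 60, False, 0, 0), (100.0, 120, False, 0, 0)]}
#   apply_outlier_algorithm(data, 'mad', 2) -> median=2.0, mad=1.0, bounds (0.0, 4.0),
#   so the 100.0 row is flagged True and returned in the outliers list.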
def detect_outliers(params, policy_chunks):
"""
Updates the policy_chunks dictionary with outliers detected based on the chosen method;
also returns the outliers in a separate dictionary
"""
policy_outlier_map = collections.OrderedDict()
logger = params['logger']
# Identify outliers per policy block
for k, v in policy_chunks.items():
data = {'alert_values': v}
outliers, _, _ = apply_outlier_algorithm(data, params['method'].lower(), params['multiplier'])
logger.debug("%s outliers identified for method: %s, multiplier: %s, outliers: %s" % (len(outliers), params['method'], params['multiplier'], outliers))
policy_outlier_map[k] = outliers
policy_chunks[k] = data['alert_values']
return policy_outlier_map