# Copyright (C) 2005-2025 Splunk Inc. All Rights Reserved.
import csv
import copy
import collections
from .custom_threshold_window import CustomThresholdWindow
from .chunked_util import die, add_message, read_chunk, write_chunk
from datetime import datetime, timedelta
from .kpi import KPIBase, ServiceKPI, TempKPI, FileBackedKPI, Service, EntityThreshold
import logging
import math
import pytz
from pytz.exceptions import UnknownTimeZoneError
import statistics
import sys
from io import StringIO
import hashlib
from ITOA.itoa_common import is_feature_enabled
from ITOA.setup_logging import setup_logging
from itsi.objects.itsi_at_incremental_values import ItsiAtIncrementalValues
from itsi.itsi_time_block_utils import PolicyFilter
from tdigest import TDigest

from SA_ITOA_app_common.solnlib.conf_manager import ConfManager

##################
# itsiatutils
##################
# Utility module for AT and outlier detection custom search commands.

# Windows will mangle our line-endings unless we do this.
if sys.platform == "win32":
    import os
    import msvcrt

    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
    msvcrt.setmode(sys.stderr.fileno(), os.O_BINARY)
    msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY)

outlier_logger = setup_logging("itsi_apply_at_outliers.log", "itsi.apply_at.outliers", level=logging.DEBUG)
MIN_DATASET_LEN = 20

"""
|
|
Factor used to scale down number of KPIs processed per batch in the itsibatch CSC.
|
|
Limits the amount of KPI time series data needed to be passed to the applyat CSC by
|
|
training window.
|
|
"""
|
|
AT_SCALE_DOWN_FACTORS = {
|
|
'-7d': 1,
|
|
'-14d': 2,
|
|
'-30d': 4,
|
|
'-60d': 8,
|
|
}
|
|
|
|
"""
|
|
Map of the string representation of the adaptive thresholding training window to
|
|
the integer value of the number of days of the training window
|
|
"""
|
|
AT_WINDOW_TO_DAYS_MAP = {
|
|
'-7d': 7,
|
|
'-14d': 14,
|
|
'-30d': 30,
|
|
'-60d': 60,
|
|
}
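
# Illustrative example (hypothetical `base_batch_size`; the real batching logic
# lives in the itsibatch command, outside this module): a longer training window
# shrinks the per-batch KPI count so the volume of time series data handed to
# applyat stays roughly constant.
#
#   window = '-30d'
#   batch_size = max(1, base_batch_size // AT_SCALE_DOWN_FACTORS[window])  # 4x fewer KPIs
#   days = AT_WINDOW_TO_DAYS_MAP[window]                                   # 30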


def log_and_warn(metadata, logger, msg, search_msg=None):
    search_msg = search_msg or msg
    logger.warn(msg)
    add_message(metadata, 'WARN', search_msg)


def log_and_die(metadata, logger, msg, search_msg=None):
    logger.error(msg)
    die(metadata, msg, search_msg)


def generate_at_search(kpi_ids, use_incremental_method=False, log_level='INFO'):
    """
    Creates the search needed to run KPI level adaptive thresholding

    @type: list
    @param kpi_ids: the list of kpi_ids

    @type: bool
    @param use_incremental_method: flag indicating if the incremental method should be applied

    @type: string
    @param log_level: log_level for applyat command
    """
    if not isinstance(kpi_ids, list) or len(kpi_ids) < 1:
        return ''

    incremental_method_flag = 'useincrementalmethod' if use_incremental_method else ''

    itsi_kpi_ids = 'itsi_kpi_id IN (' + ', '.join(kpi_ids) + ')'
    return '| mstats latest(alert_value) AS alert_value latest(alert_level) AS alert_level WHERE ' \
           '`get_itsi_summary_metrics_index` AND ( ' + itsi_kpi_ids + ' ) AND is_filled_gap_event!=1 ' \
           'AND is_null_alert_value=0 `metrics_service_level_kpi_only` by itsi_kpi_id, ' \
           'itsi_service_id span=1m | where alert_level!=-2 and not isnull(alert_value) | table _time, alert_value, alert_level, ' \
           'itsi_kpi_id, itsi_service_id | applyat ' + incremental_method_flag + ' log_level=' + log_level
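
# Illustrative usage (hypothetical KPI ids): the returned SPL pulls the summary
# metrics for the selected KPIs and pipes them into the applyat command.
#
#   spl = generate_at_search(['kpi_a', 'kpi_b'], use_incremental_method=True)
#   # '| mstats ... itsi_kpi_id IN (kpi_a, kpi_b) ... | applyat useincrementalmethod log_level=INFO'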


def generate_entity_at_search(entity_objects, use_incremental_method=False, log_level='INFO'):
    """
    Creates the search needed to run entity level adaptive thresholding

    @type: list
    @param entity_objects: the list of entity objects having entity_key, entity_title and kpi_id

    @type: bool
    @param use_incremental_method: flag indicating if the incremental method should be applied

    @type: string
    @param log_level: log_level for applyat command

    @rtype: string
    @return: SPL search
    """
    rv = ""
    if not isinstance(entity_objects, list) or len(entity_objects) < 1:
        return rv

    incremental_method_flag = 'useincrementalmethod' if use_incremental_method else ''

    kpi_filter_string = " OR ".join("(itsi_kpi_id=\"" + x['kpi_id'] + "\" AND entity_key=\"" + x['entity_key']
                                    + "\" AND entity_title=\"" + x['entity_title'] + "\")" for x in entity_objects)
    base_string = """
        | mstats latest(alert_value) AS alert_value latest(alert_level) AS alert_level WHERE
        `get_itsi_summary_metrics_index` AND ( {kpi_filter} ) AND is_filled_gap_event!=1 AND is_null_alert_value=0
        AND `metrics_entity_level_kpi_only` by itsi_kpi_id, itsi_service_id, entity_key, entity_title span=1m
        | where alert_level!=-2 and not isnull(alert_value)
        | table _time, alert_value, alert_level, itsi_kpi_id, itsi_service_id, entity_key, entity_title
    """
    rv += base_string.format(kpi_filter=kpi_filter_string)
    rv += " | applyat entitylevelthreshold " + incremental_method_flag + " log_level=" + log_level
    return rv
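
# Illustrative usage (hypothetical entity objects): each dict contributes one
# (kpi, entity) clause to the mstats filter.
#
#   spl = generate_entity_at_search(
#       [{'kpi_id': 'kpi_a', 'entity_key': 'e1', 'entity_title': 'host1'}])
#   # '... ( (itsi_kpi_id="kpi_a" AND entity_key="e1" AND entity_title="host1") ) ...
#   #  | applyat entitylevelthreshold  log_level=INFO'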


def generate_ml_entity_at_search(entity_objects, kpi_object):
    """
    Creates the search needed to run entity-level ML-assisted adaptive thresholding

    @type: list of dict
    @param entity_objects: the list of entity objects having entity_key, entity_title and kpi_id

    @type: dict
    @param kpi_object: KPI object with entity AT parameters

    @rtype: string
    @return: SPL search
    """
    rv = ""
    if not isinstance(entity_objects, list) or len(entity_objects) < 1:
        return rv

    kpi_filter_string = " OR ".join("(itsi_kpi_id=\"" + x['kpi_id'] + "\" AND entity_key=\"" + x['entity_key']
                                    + "\" AND entity_title=\"" + x['entity_title'] + "\")" for x in entity_objects)
    base_string = """
        | mstats latest(alert_value) AS alert_value latest(alert_level) AS alert_level WHERE
        `get_itsi_summary_metrics_index` AND ( {kpi_filter} ) AND is_filled_gap_event!=1 AND is_null_alert_value=0
        AND `metrics_entity_level_kpi_only` by itsi_kpi_id, itsi_service_id, entity_key, entity_title span=1m
        | where alert_level!=-2 and not isnull(alert_value)
        | table _time, alert_value, alert_level, itsi_kpi_id, itsi_service_id, entity_key, entity_title
    """
    rv += base_string.format(kpi_filter=kpi_filter_string)
    command_string = """
        | recommendthresholdtemplate entity_level_processing=true
        send_to_api=true threshold_direction={threshold_direction} use_static={use_static}
        analysis_window={analysis_window}
    """
    rv += command_string.format(
        threshold_direction=kpi_object["entity_threshold_direction"],
        use_static=kpi_object["entity_keep_recommended_policy_static"],
        analysis_window=kpi_object["entity_recommendation_training_window"],
    )
    if kpi_object.get("entity_recommendation_allow_negative_value") is not None:
        rv += "non_negative={0} ".format(not kpi_object["entity_recommendation_allow_negative_value"])
    if kpi_object.get("entity_recommendation_threshold_sensitivity") is not None:
        rv += "sensitivity_level={0} ".format(kpi_object["entity_recommendation_threshold_sensitivity"])
    return rv


def generate_ml_entity_at_scout_search(entity_objects):
    """
    Creates the helper search for identifying entities with data for this KPI

    @type: list of dict
    @param entity_objects: the list of entity objects having entity_key, entity_title and kpi_id

    @rtype: string
    @return: SPL search
    """
    rv = ""
    if not isinstance(entity_objects, list) or len(entity_objects) < 1:
        return rv

    kpi_filter_string = " OR ".join("(itsi_kpi_id=\"" + x['kpi_id'] + "\" AND entity_key=\"" + x['entity_key']
                                    + "\" AND entity_title=\"" + x['entity_title'] + "\")" for x in entity_objects)
    base_string = """
        | mstats latest(alert_value) AS alert_value latest(alert_level) AS alert_level WHERE
        `get_itsi_summary_metrics_index` AND ( {kpi_filter} ) AND is_filled_gap_event!=1 AND is_null_alert_value=0
        AND `metrics_entity_level_kpi_only` by itsi_kpi_id, itsi_service_id, entity_key, entity_title span=1d
        | where alert_level!=-2 and not isnull(alert_value)
        | dedup entity_key, entity_title
        | table _time, alert_value, alert_level, itsi_kpi_id, itsi_service_id, entity_key, entity_title
    """
    return base_string.format(kpi_filter=kpi_filter_string)


def divide_into_batches(ids, batch_size=1, incremental_method=False):
    """
    Divides the ids in groups by batch size

    @type: list
    @param ids: the list of ids

    @type: int
    @param batch_size: batch size to divide ids into

    @type: bool
    @param incremental_method: flag indicating if incremental method should be applied
    """
    for i in range(0, len(ids), batch_size):
        yield [ids[i: i + batch_size], incremental_method]
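
# Example: three ids in batches of two yield two work items, each paired with
# the incremental-method flag.
#
#   >>> list(divide_into_batches(['a', 'b', 'c'], batch_size=2, incremental_method=True))
#   [[['a', 'b'], True], [['c'], True]]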


def generate_searches(batches):
    """
    Generate the SPL needed to run adaptive thresholding searches

    @type: list
    @param batches: the groups of ids to generate searches for
    """
    for batch in batches:
        yield generate_at_search(batch)


def quantile(data, q):
    """Naive implementation of linear-interpolated quantile.

    Comparable to numpy.percentile()/pd.DataFrame.quantile().
    Author: Jacob Leverich (jleverich@splunk.com)
    """
    assert q >= 0. and q <= 1.
    m = float(len(data) - 1)
    i = m * q

    ilow = math.floor(i)
    ihigh = math.ceil(i)
    if ilow == ihigh:
        return data[int(ilow)]

    f = (i - ilow) / (ihigh - ilow)
    low = data[int(ilow)]
    high = data[int(ihigh)]
    return low + f * (high - low)


def quantiles(data, levels):
    # Remove nan's if any in the data
    data = [x for x in data if not math.isnan(x)]
    data = sorted(data)
    out = {x: quantile(data, float(x)) for x in levels}
    return out
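
# Doctest-style examples (linear interpolation, matching numpy.percentile on
# already-sorted input; quantiles() sorts and drops NaNs itself):
#
#   >>> quantile([1.0, 2.0, 3.0, 4.0], 0.5)
#   2.5
#   >>> quantiles([3.0, float('nan'), 1.0, 2.0], [0.0, 1.0])
#   {0.0: 1.0, 1.0: 3.0}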


def parse_input_data(the_dict, data, fields_list, params):
    """
    Populates the_dict with the values in data keyed by the fields in fields_list.

    @param the_dict: dict keyed by service_id and then by kpi_id into which we will write the data
    @param data: the incoming event data
    @param fields_list: list of strings containing the field names to be added as data to the appropriate list in the_dict
    @param params: Contains keys 'logger', 'use_kv_store', 'out_metadata', and 'kpi', the last of which contains 'service_id' and 'kpi_id'
    """
    use_kv_store = params['use_kv_store']
    logger = params['logger']
    reader = csv.DictReader(data.splitlines(), dialect='excel')

    for record in reader:
        if 'itsi_service_id' not in record:
            if not use_kv_store:
                log_and_warn(metadata=params['out_metadata'], logger=logger,
                             msg="Missing Service ID: %s. Generating dummy value." % repr(record))
            record['itsi_service_id'] = 'DEFAULT_SERVICE_ID'
        if 'itsi_kpi_id' not in record:
            if not use_kv_store:
                log_and_warn(metadata=params['out_metadata'], logger=logger,
                             msg="Missing KPI ID: %s. Generating dummy value." % repr(record))
            record['itsi_kpi_id'] = 'DEFAULT_KPI_ID'
        if params['entity_level_thresholds'] and 'entity_title' not in record:
            if not use_kv_store:
                log_and_warn(metadata=params['out_metadata'], logger=logger,
                             msg="Missing Entity Title: %s. Generating dummy value." % repr(record))
            record['entity_title'] = 'DEFAULT_ENTITY_TITLE'

        for f in fields_list:
            if record[f] == '' and f != 'itsi_service_id' and f != 'itsi_kpi_id' and f != 'entity_title' and f != 'entity_key':
                log_and_die(
                    metadata=params['out_metadata'], logger=logger,
                    msg="Missing field %s at time %s" % (str(f), str(record['_time'])))
        itsi_service_id = record['itsi_service_id']
        itsi_kpi_id = record['itsi_kpi_id']
        if itsi_service_id not in the_dict:
            the_dict[itsi_service_id] = dict()
        if itsi_kpi_id not in the_dict[itsi_service_id]:
            the_dict[itsi_service_id][itsi_kpi_id] = dict()
            if not params['entity_level_thresholds']:
                tmpdict = {}
                for f in fields_list:
                    tmpdict[f] = list()
                the_dict[record['itsi_service_id']][record['itsi_kpi_id']] = tmpdict

        if params['entity_level_thresholds']:
            itsi_entity_key = record.get('entity_key', "N/A")
            if itsi_entity_key == "N/A":
                itsi_entity_key = hashlib.md5((record['entity_title'] + itsi_kpi_id).encode("utf-8")).hexdigest()
                params['pseudo_entities'].update({itsi_entity_key: record['entity_title']})
            if itsi_entity_key not in the_dict[itsi_service_id][itsi_kpi_id]:
                tmpdict = {}
                for f in fields_list:
                    tmpdict[f] = list()
                the_dict[record['itsi_service_id']][record['itsi_kpi_id']][itsi_entity_key] = tmpdict
            currentdict = the_dict[itsi_service_id][itsi_kpi_id][itsi_entity_key]
        else:
            currentdict = the_dict[itsi_service_id][itsi_kpi_id]

        for f in fields_list:
            currentdict[f].append(record[f])


def drop_dup(data, index):
    """Naive re-implementation of pd.DataFrame.drop_duplicates()"""
    out_data = {k: [] for k in list(data.keys())}
    last = None
    for i, v in enumerate(data[index]):
        if v != last:
            for k in list(data.keys()):
                out_data[k].append(data[k][i])
            last = v

    return out_data
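
# Example: only consecutive duplicates of the index column are dropped, which is
# sufficient for time-sorted series.
#
#   >>> drop_dup({'_time': ['1', '1', '2'], 'alert_value': ['5', '6', '7']}, '_time')
#   {'_time': ['1', '2'], 'alert_value': ['5', '7']}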


def clean_values(data, params):
    """Non-pandas replacement for atad_utils.create_dataframe().

    @param data: dict of '_time': list(epoch timestamp strings)
                         'alert_value': list(float strings)
                         'alert_period': list(float strings), optional
    @param params: dict with keys 'logger' and 'out_metadata'
    """
    logger = params['logger']
    metadata = params['out_metadata']

    values = dict(data)

    for i in range(len(values['_time'])):
        try:
            values['_time'][i] = float(values['_time'][i])
        except ValueError:
            log_and_warn(metadata, logger, "Can't parse _time '%s' as float" % values['_time'][i])
            values['_time'][i] = float('nan')

    # Drop duplicates
    values = drop_dup(values, '_time')

    for i in range(len(values['alert_value'])):
        try:
            values['alert_value'][i] = float(values['alert_value'][i])
        except ValueError:
            log_and_warn(metadata, logger, "Can't parse alert_value '%s' as float" % values['alert_value'][i])
            values['alert_value'][i] = float('nan')

    if 'alert_period' in values:
        for i in range(len(values['alert_period'])):
            try:
                values['alert_period'][i] = float(values['alert_period'][i])
            except ValueError:
                log_and_warn(metadata, logger, "Can't parse alert_period '%s' as float" % values['alert_period'][i])
                values['alert_period'][i] = float('nan')

    return values


def get_service_object(params):
    service_object = None

    if params['use_kv_store'] and not params['use_temp_collection']:
        service_object = Service(logger=params['logger'])
        service_object.initialize_interface(
            params['session_key'], owner='nobody')
    return service_object


def get_kpi_object(params):
    kpi_object = None

    if params['use_kv_store']:
        if params['use_temp_collection'] and params['temp_collection'] is not None and params['temp_key'] is not None:
            kpi_object = TempKPI(logger=params['logger'], temp_collection_name=params['temp_collection'], temp_object_key=params['temp_key'])
        else:
            kpi_object = ServiceKPI(
                logger=params['logger'], service_data=params['kpi']['service_data'], kpi_id=params['kpi']['kpi_id'])

        kpi_object.initialize_interface(
            params['session_key'], owner='nobody', namespace='SA-ITOA')
        kpi_object.fetch_kpi()
        params['logger'].debug(
            "Initialized KV interface with session key %s" % params['session_key'])
    elif params['settings_file'] is not None:
        kpi_object = FileBackedKPI(
            logger=params['logger'], filename=params['settings_file'])

    return kpi_object


# Policy Class
class Policy(object):

    def __init__(self, key, method, parameters, at_run_params, **kwargs):
        # validate methods and parameters
        if not isinstance(key, str):
            raise ValueError(
                "Null or non-string key sent to Policy constructor.")
        if not isinstance(method, str):
            raise ValueError(
                "Null or non-string method sent to Policy constructor. Must be a string: stdev, quantile, range, or percentage.")
        method_str = str(method)
        if method_str not in ['stdev', 'quantile', 'range', 'percentage']:
            raise ValueError(
                "Method must be one of stdev, quantile, range, or percentage.")
        if not parameters:  # parameters is a list of threshold levels
            raise ValueError("Null parameters sent to Policy constructor.")
        if not isinstance(parameters, list) or len(parameters) > 10:
            raise ValueError(
                "Parameters must be a list of no more than 10 levels.", parameters)
        if not all('dynamicParam' in x for x in parameters):
            raise ValueError("Every level record must have a dynamicParam attribute")

        # store policies in form amenable to computing thresholds
        self.key = key
        self.method = method_str
        self.parameters = parameters
        self.title = kwargs.get('title', key)
        self.logger = kwargs.get('logger')
        self.at_run_params = at_run_params

    @property
    def parameter_values(self):
        # property that extracts dynamic param values from parameter list
        return [float(x['dynamicParam']) for x in self.parameters]

    def get_updated_levels(self, computed_thresholds, kpi_id, service_id):
        """
        Returns a copy of the levels structure stored in self.parameters
        where the thresholdValue field is updated from the computed levels array
        """
        if len(computed_thresholds) != len(self.parameters):
            raise ValueError("Computed thresholds and stored thresholds structures are not of the same length")
        result = []
        for computed_value, level in zip(computed_thresholds, self.parameters):
            level_copy = copy.copy(level)
            level_copy['thresholdValue'] = computed_value
            result.append(level_copy)
        self.logger.debug("Calculated thresholdLevels for policy %s of kpi %s and service %s are %s", self.key, kpi_id, service_id, result)
        return result

    def update_outlier_incremental_value(self, value_to_update, alert_value, outlier_method):
        """
        Updates the incremental value appropriately based on the alert
        value and policy type with data for use in outlier exclusion calculation

        @type: dict
        @param value_to_update: incremental values object to apply updates on

        @type: float
        @param alert_value: KPI alert value to update incremental values with

        @type: string
        @param outlier_method: outlier exclusion algorithm method of the KPI
        """
        if isinstance(alert_value, (int, float)):
            if outlier_method == 'stdev':
                if 'unfiltered_sum' in value_to_update:
                    value_to_update['unfiltered_sum'] += alert_value
                    value_to_update['unfiltered_count'] += 1
                    value_to_update['unfiltered_sum_of_squares'] += alert_value ** 2
                else:
                    value_to_update['unfiltered_sum'] = alert_value
                    value_to_update['unfiltered_count'] = 1
                    value_to_update['unfiltered_sum_of_squares'] = alert_value ** 2
            elif outlier_method in ('mad', 'iqr'):
                if 'unfiltered_digest' not in value_to_update:
                    value_to_update['unfiltered_digest'] = TDigest()
                value_to_update['unfiltered_digest'].update(alert_value)
            else:
                raise Exception("Unsupported outlier detection method: %s" % outlier_method)

    def update_incremental_value(self, value_to_update, alert_value):
        """
        Updates the incremental value appropriately based on the alert
        value and policy type

        @type: dict
        @param value_to_update: incremental values object to apply updates on

        @type: float
        @param alert_value: KPI alert value to update incremental values with
        """
        if isinstance(alert_value, (int, float)):
            if self.method == 'stdev':
                if 'sum' in value_to_update:
                    value_to_update['sum'] += alert_value
                    value_to_update['sum_of_squares'] += alert_value ** 2
                    value_to_update['count'] += 1
                else:
                    value_to_update['sum'] = alert_value
                    value_to_update['sum_of_squares'] = alert_value ** 2
                    value_to_update['count'] = 1
            elif self.method == 'range':
                if 'min' in value_to_update:
                    value_to_update['min'] = min(value_to_update['min'], alert_value)
                    value_to_update['max'] = max(value_to_update['max'], alert_value)
                    value_to_update['count'] += 1
                else:
                    value_to_update['min'] = alert_value
                    value_to_update['max'] = alert_value
                    value_to_update['count'] = 1
            elif self.method == 'quantile':
                if 'digest' not in value_to_update:
                    value_to_update['digest'] = TDigest()
                value_to_update['digest'].update(alert_value)
            elif self.method == 'percentage':
                if 'sum' in value_to_update:
                    value_to_update['sum'] += alert_value
                    value_to_update['count'] += 1
                else:
                    value_to_update['sum'] = alert_value
                    value_to_update['count'] = 1
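
    # Worked example of the 'stdev' accumulators (hypothetical values 2.0, 4.0):
    #
    #   acc = {}
    #   policy.update_incremental_value(acc, 2.0)  # {'sum': 2.0, 'sum_of_squares': 4.0, 'count': 1}
    #   policy.update_incremental_value(acc, 4.0)  # {'sum': 6.0, 'sum_of_squares': 20.0, 'count': 2}
    #
    # mean = sum / count = 3.0 and variance = sum_of_squares / count - mean ** 2 = 1.0,
    # so thresholds can later be recomputed without retaining the raw series.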

    def apply_outlier_algorithm_incremental(self, method, multiplier, incremental_values, kpi_values):
        """
        Calculates and marks outliers in the KPI data based on the outlier exclusion algorithm method
        and multiplier

        @type: string
        @param method: outlier exclusion algorithm method of the KPI

        @type: float
        @param multiplier: the sensitivity multiplier of the outlier exclusion algorithm

        @type: list
        @param incremental_values: incremental values containing aggregate KPI data

        @type: list
        @param kpi_values: kpi value data [value, timestamp, outlier flag, lower bound, upper bound]

        @return: a list containing updated KPI data values, outlier count, and bounds data
        """
        if method is None:
            method = "stdev"
        if multiplier is None:
            multiplier = 2
        kpi_values_copy = [value for value, _, _, _, _ in kpi_values]
        lower_bound, upper_bound = None, None
        if method == 'mad':
            digest = TDigest()
            for value in incremental_values:
                if 'unfiltered_digest' in value:
                    digest += value['unfiltered_digest']
            if digest.n > 0:
                median = digest.percentile(50)
                mad = statistics.median([abs(val - median) for val in kpi_values_copy])
                upper_bound = median + (float(multiplier) * mad)
                lower_bound = median - (float(multiplier) * mad)
        elif method == 'iqr':
            digest = TDigest()
            for value in incremental_values:
                if 'unfiltered_digest' in value:
                    digest += value['unfiltered_digest']
            if digest.n > 0:
                median = digest.percentile(50)
                iqr = digest.percentile(75) - digest.percentile(25)
                upper_bound = median + (float(multiplier) * iqr)
                lower_bound = median - (float(multiplier) * iqr)
        elif method == 'stdev':
            total_count, total_sum, total_sum_of_squares = 0, 0, 0
            for value in incremental_values:
                if 'unfiltered_sum' in value:
                    total_sum += value['unfiltered_sum']
                    total_count += value['unfiltered_count']
                    total_sum_of_squares += value['unfiltered_sum_of_squares']
            if total_count > 0:
                mean = total_sum / total_count
                variance = (total_sum_of_squares - 2 * mean * total_sum + total_count * mean ** 2) / total_count
                stdev = math.sqrt(variance)
                upper_bound = mean + (float(multiplier) * stdev)
                lower_bound = mean - (float(multiplier) * stdev)
        else:
            raise Exception("Unsupported outlier detection method: %s" % method)

        updated_values = []
        outlier_count = 0
        if lower_bound is not None and upper_bound is not None:
            for value in kpi_values:
                value = list(value)
                is_outlier = lower_bound > value[0] or value[0] > upper_bound
                value[2], value[3], value[4] = is_outlier, lower_bound, upper_bound
                if is_outlier:
                    outlier_count += 1
                updated_values.append(tuple(value))
        return updated_values, outlier_count, lower_bound, upper_bound
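
    # Sketch of the 'stdev' path with hypothetical aggregates: the expression
    # (sum_sq - 2*mean*sum + n*mean**2) / n above reduces to E[x^2] - mean^2,
    # i.e. the population variance, computed purely from the per-day
    # unfiltered_sum / unfiltered_count / unfiltered_sum_of_squares accumulators:
    #
    #   mean = total_sum / total_count
    #   variance = total_sum_of_squares / total_count - mean ** 2
    #   bounds = (mean - multiplier * stdev, mean + multiplier * stdev)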

    def cycle_incremental_values(self, values_to_update, new_values, params, entity_config={}):
        """
        Performs the cycling process of the incremental values, adding a new value for
        the past 24 hours and removing any expired values outside of the training window

        @type: list
        @param values_to_update: list of incremental values to update

        @type: list
        @param new_values: kpi value data [value, timestamp, outlier flag, lower bound, upper bound]

        @type: dict
        @param params: applyat custom search command parameters

        @type: object
        @param entity_config: configuration object for entity level thresholding
        """
        now = datetime.now()
        try:
            if params['dst_timezone']:
                tz = pytz.timezone(params['dst_timezone'])
                now = datetime.now(tz)
        except UnknownTimeZoneError as e:
            self.logger.exception(e)
            self.logger.error('Found Unknown timezone')
        updated_data_values = new_values
        outlier_method = None
        if (not entity_config and params['kpi']['detect_outliers']) or \
                (entity_config and 'aggregate_outlier_detection_enabled' in entity_config and entity_config['aggregate_outlier_detection_enabled']):
            outlier_method = params['kpi']['outlier_detection_algo'] if not entity_config else entity_config['outlier_detection_algo']
            outlier_multiplier = params['kpi']['outlier_multiplier'] if not entity_config else entity_config['outlier_detection_sensitivity']
            outliers, count, lower_bound, upper_bound = self.apply_outlier_algorithm_incremental(outlier_method, outlier_multiplier, values_to_update, new_values)
            updated_data_values = outliers
            outlier_dict = {
                'kpi_id': params['kpi']['kpi_id'],
                'service_id': self.at_run_params['service_id'],
                'policy_key': self.key,
                'training_window': self.at_run_params['training_window'],
                'at_run_epoch': self.at_run_params['at_run_epoch'],
                'use_temp_collection': self.at_run_params['use_temp_collection'],
                'method': outlier_method,
                'multiplier': outlier_multiplier,
                'count': count,
                'lower_bound': lower_bound,
                'upper_bound': upper_bound
            }
            outlier_logger.info(outlier_dict)
            self.logger.info("KPI: %s, %s outliers identified and removed" % (params['kpi']['kpi_id'], count))
        training_window = self.at_run_params['training_window']
        last_day = now - timedelta(days=1)
        start_of_day = last_day.replace(hour=0, minute=0, second=0, microsecond=0).timestamp()
        if start_of_day != values_to_update[-1]['timestamp']:
            new_incremental_value = {'timestamp': start_of_day}
            for data in updated_data_values:
                alert_value = data[0]
                timestamp = data[1]
                if outlier_method and timestamp >= start_of_day:
                    self.update_outlier_incremental_value(new_incremental_value, alert_value, outlier_method)
                if data[2]:
                    continue
                if timestamp >= start_of_day:
                    self.update_incremental_value(new_incremental_value, alert_value)
            values_to_update.append(new_incremental_value)
        training_window_diff = len(values_to_update) - AT_WINDOW_TO_DAYS_MAP[training_window]
        if training_window_diff > 0:
            del values_to_update[:training_window_diff]

    def get_incremental_values(self, values, params, entity_config={}):
        """
        Returns a list of incremental values split by days of the training window
        with calculations based on the policy type
        """
        now = datetime.now()
        try:
            if params['dst_timezone']:
                tz = pytz.timezone(params['dst_timezone'])
                now = datetime.now(tz)
        except UnknownTimeZoneError as e:
            self.logger.exception(e)
            self.logger.error('Found Unknown timezone')
        training_window = self.at_run_params['training_window']
        incremental_values = []
        for days_ago in range(AT_WINDOW_TO_DAYS_MAP[training_window]):
            day = now - timedelta(days=days_ago + 1)
            start_of_day = day.replace(hour=0, minute=0, second=0, microsecond=0).timestamp()
            incremental_values.insert(0, {'timestamp': start_of_day})
        if len(values) < MIN_DATASET_LEN:
            self.logger.error("There are less than %s data points to calculate thresholds in policy: %s, values: %s" % (MIN_DATASET_LEN, self.key, values))
            return incremental_values
        cur_index = 0
        next_timestamp = incremental_values[cur_index + 1]['timestamp']
        outlier_method = None
        updated_values = values
        if not entity_config and params['kpi']['detect_outliers']:
            alert_value_data = {'alert_values': values}
            outlier_method = params['kpi']['outlier_detection_algo']
            outlier_multiplier = params['kpi']['outlier_multiplier']
            apply_outlier_algorithm(alert_value_data, outlier_method, outlier_multiplier)
            updated_values = alert_value_data['alert_values']
        elif entity_config and 'aggregate_outlier_detection_enabled' in entity_config and entity_config['aggregate_outlier_detection_enabled']:
            alert_value_data = {'alert_values': values}
            outlier_method = entity_config['outlier_detection_algo']
            outlier_multiplier = entity_config['outlier_detection_sensitivity']
            apply_outlier_algorithm(alert_value_data, outlier_method, outlier_multiplier)
            updated_values = alert_value_data['alert_values']
        for data in updated_values:
            alert_value = data[0]
            timestamp = data[1]
            if timestamp >= next_timestamp:
                cur_index += 1
                if cur_index + 1 >= AT_WINDOW_TO_DAYS_MAP[training_window]:
                    next_timestamp = now.timestamp()
                else:
                    next_timestamp = incremental_values[cur_index + 1]['timestamp']
            if outlier_method and timestamp >= incremental_values[cur_index]['timestamp']:
                self.update_outlier_incremental_value(incremental_values[cur_index], alert_value, outlier_method)
            if data[2]:
                continue
            if timestamp >= incremental_values[cur_index]['timestamp']:
                self.update_incremental_value(incremental_values[cur_index], alert_value)
        return incremental_values
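
    # Shape of the returned structure (illustrative, '-7d' window, hypothetical
    # epochs): one bucket per training-window day, oldest first, each holding only
    # the accumulator fields for this policy's method:
    #
    #   [{'timestamp': 1700000000.0, 'sum': 41.5, 'sum_of_squares': 180.3, 'count': 9},
    #    ...,
    #    {'timestamp': 1700518400.0}]  # a day with no usable (non-outlier) data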

    # returns a copy of threshold levels structure with thresholdValue field updated
    def get_thresholds(self, values, kpi_dict, use_incremental_method=False, entity_config=False):
        if self.method is None:
            raise UnboundLocalError("No method set for Policy.")

        if not use_incremental_method:
            data = {'alert_values': values}
            if len(values) < MIN_DATASET_LEN:
                self.logger.error("There are less than %s data points to calculate thresholds in policy: %s, values: %s" % (MIN_DATASET_LEN, self.key, values))
                return None
            if (not entity_config and kpi_dict['detect_outliers']) or \
                    (entity_config and 'aggregate_outlier_detection_enabled' in entity_config and entity_config['aggregate_outlier_detection_enabled']):
                outlier_method = kpi_dict['outlier_detection_algo'] if not entity_config else entity_config['outlier_detection_algo']
                outlier_multiplier = kpi_dict['outlier_multiplier'] if not entity_config else entity_config['outlier_detection_sensitivity']
                outliers, lower_bound, upper_bound = remove_outliers(data, outlier_method, outlier_multiplier)
                outlier_dict = {
                    'kpi_id': kpi_dict['kpi_id'],
                    'service_id': self.at_run_params['service_id'],
                    'policy_key': self.key,
                    'training_window': self.at_run_params['training_window'],
                    'at_run_epoch': self.at_run_params['at_run_epoch'],
                    'use_temp_collection': self.at_run_params['use_temp_collection'],
                    'method': outlier_method,
                    'multiplier': outlier_multiplier,
                    'count': len(outliers),
                    'lower_bound': lower_bound,
                    'upper_bound': upper_bound
                }
                # Write outliers metadata to outlier log.
                outlier_logger.info(outlier_dict)
                self.logger.info("KPI: %s, %s outliers identified and removed: %s" % (kpi_dict['kpi_id'], len(outliers), outliers))
            filtered_values = data['alert_values']
            D = {'alert_values': [v[0] for v in filtered_values if not math.isnan(v[0])]}
            if len(D['alert_values']) < MIN_DATASET_LEN:
                self.logger.error("There are less than %s data points in policy: %s, %s" % (MIN_DATASET_LEN, self.key, values))
                return None

        if self.method == 'stdev':  # pretty standard, really
            if use_incremental_method:
                total_count = 0
                total_sum = 0
                total_sum_squares = 0
                for value in values:
                    if 'sum' in value:
                        total_count += value['count']
                        total_sum += value['sum']
                        total_sum_squares += value['sum_of_squares']
                if total_count < MIN_DATASET_LEN:
                    self.logger.error("There are less than %s data points for stdev to calculate thresholds in policy: %s" % (MIN_DATASET_LEN, self.key))
                    return None
                mean = total_sum / total_count
                variance = (total_sum_squares - 2 * mean * total_sum + total_count * (mean ** 2)) / total_count
                std = math.sqrt(variance)
            else:
                # Simple two-pass algorithm for calculating stdev. Reasonably numerically stable.
                mean = sum(D['alert_values']) / len(D['alert_values'])
                sqe = sum((x - mean) ** 2. for x in D['alert_values'])
                std = math.sqrt(sqe / (len(D['alert_values']) - 1))
            if std == 0.0:
                # Very rare scenario when all the alert values are the same,
                # setting it to a non-zero value based on a heuristic.
                self.logger.info("STD evaluated as 0, setting it to a non-zero value.")
                std = mean * 0.001 + 0.001  # 1000th of the mean
            return self.get_updated_levels([mean + (std * c) for c in self.parameter_values], kpi_dict['kpi_id'], self.at_run_params['service_id'])
        # formerly iqr and same as "mass" in prior iterations
        elif self.method == 'quantile':
            if use_incremental_method:
                digest = TDigest()
                for value in values:
                    if 'digest' in value:
                        digest += value['digest']
                if digest.n < MIN_DATASET_LEN:
                    self.logger.error("There are less than %s data points for quantile to calculate thresholds in policy: %s" % (MIN_DATASET_LEN, self.key))
                    return None
                T = {x: digest.percentile(x * 100) for x in self.parameter_values}
            else:
                T = quantiles(D['alert_values'], self.parameter_values)
            return self.get_updated_levels([T[k] for k in self.parameter_values], kpi_dict['kpi_id'], self.at_run_params['service_id'])
        elif self.method == 'range':  # equal width bands
            if use_incremental_method:
                dmax = -math.inf
                dmin = math.inf
                total_count = 0
                for value in values:
                    if 'min' in value:
                        dmax = max(dmax, value['max'])
                        dmin = min(dmin, value['min'])
                        total_count += value['count']
                if total_count < MIN_DATASET_LEN:
                    self.logger.error("There are less than %s data points for range to calculate thresholds in policy: %s" % (MIN_DATASET_LEN, self.key))
                    return None
            else:
                dmax = max(D['alert_values'])
                dmin = min(D['alert_values'])
            span = dmax - dmin
            return self.get_updated_levels([dmin + (span * c) for c in self.parameter_values], kpi_dict['kpi_id'], self.at_run_params['service_id'])
        elif self.method == 'percentage':
            if use_incremental_method:
                total_sum = 0
                total_count = 0
                for value in values:
                    if 'sum' in value:
                        total_sum += value['sum']
                        total_count += value['count']
                if total_count < MIN_DATASET_LEN:
                    self.logger.error("There are less than %s data points for percentage to calculate thresholds in policy: %s" % (MIN_DATASET_LEN, self.key))
                    return None
                mean = total_sum / total_count
            else:
                # Simple Percentage as a baseline algorithm, calculate mean and use it as a base of percentage
                mean = sum(D['alert_values']) / len(D['alert_values'])
            return self.get_updated_levels([mean * (1 + c / 100) for c in self.parameter_values], kpi_dict['kpi_id'], self.at_run_params['service_id'])
        else:
            raise ValueError("Invalid thresholding method: " + self.method)
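
    # Worked example of the 'stdev' method (hypothetical numbers): with mean=10.0,
    # std=2.0 and dynamicParam values [-1, 1, 2], the computed thresholds are
    # [8.0, 12.0, 14.0]; get_updated_levels() then copies each value into the
    # matching level's thresholdValue field.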


# Schedule Class
class Schedule(object):
    # policies: dict of Policy Objects keyed by policy.key
    # schedule: dict of policy_keys keyed by block_keys

    def __init__(self, kpi_object, policies, threshold_spec, params):
        # validate kpi
        if kpi_object is None:
            raise ValueError("Null KPI object sent to Schedule constructor.")
        if not isinstance(kpi_object, KPIBase):
            raise ValueError("KPI parameter must be a kpi.KPI object")
        # validate policies
        if policies is None:
            raise ValueError("Null policy dict sent to Schedule constructor.")
        if not isinstance(policies, dict):
            raise ValueError(
                "Policies parameter must be a dict, got %s." % type(policies))
        if len(policies) > 168 or len(policies) == 0:
            raise ValueError(
                "Policies parameter must be a dict of no more than 168 Policy objects, got %s." % len(policies))
        if sum([1 if not isinstance(p, Policy) else 0 for p in list(policies.values())]) > 0:
            raise ValueError("All policies must be Policy objects.")
        self.logger = None
        if 'logger' in params:
            self.logger = params['logger']

        self.kpi_object = kpi_object
        self.policies = policies
        self.filter = PolicyFilter(threshold_spec)
        self.incremental_values = None

    def _parse_digest_data(self, incremental_value, field):
        """
        Parses the serialized tdigest data stored in the incremental value and
        creates a tdigest object from it

        @type: dict
        @param incremental_value: incremental value with serialized tdigest data

        @type: string
        @param field: field containing the tdigest data

        @rtype: object
        @return: parsed tdigest object
        """
        digest = TDigest()
        digest.K = incremental_value[field].get('K')
        digest.delta = incremental_value[field].get('delta')
        for value in incremental_value[field].get('centroids'):
            digest.update(value['m'], value['c'])
        return digest

    def _get_thresholds(self, data, params, entity_config={}):
        if data is None:
            raise ValueError("Null data sent to Schedule.")
        if not isinstance(data, dict) or 'alert_value' not in data:
            raise ValueError(
                "Data passed to Schedule must be a dict with values in column 'alert_value'." + str(data))
        data_key = params['kpi']['kpi_id'] if not entity_config else entity_config['_key']
        if params['use_incremental_method']:
            at_incremental_values_obj = ItsiAtIncrementalValues(params["session_key"], 'nobody')
            kpi_at_incremental_values = at_incremental_values_obj.get_kpi_at_incremental_values('nobody', data_key)
            for policy in kpi_at_incremental_values['policies']:
                for value in kpi_at_incremental_values['policies'][policy]['incremental_values']:
                    if 'digest' in value:
                        value['digest'] = self._parse_digest_data(value, 'digest')
                    if 'unfiltered_digest' in value:
                        value['unfiltered_digest'] = self._parse_digest_data(value, 'unfiltered_digest')
        elif params['incremental_learning_enabled']:
            kpi_at_incremental_values = {
                '_key': data_key,
                'policies': {}
            }

        # divide data based on policy: D[policy_key] = [tuples]
        D = {}
        for policy_key in self.policies:
            D[policy_key] = []
        index_converted = data['_time']
        active_policies = set()
        for data_index in range(len(index_converted)):
            # If apply_dst_to_at is enabled then shift the timestamp by the dst offset
            if not params["disable_dst_to_at"] and params["dst_change_timestamp"] > 0 and params["dst_offset"] != 0:
                # If _time is earlier than the last dst_change_timestamp then add the dst_offset to it
                if index_converted[data_index] < params["dst_change_timestamp"]:
                    index_converted[data_index] = index_converted[data_index] + params["dst_offset"]

            # provide a timestamp and TZ, get the policy that includes this timestamp
            policy_key = self.filter.get_policy_key(time=index_converted[data_index])
            if policy_key in D:
                D[policy_key].append((data['alert_value'][data_index], index_converted[data_index], False, 0, 0))
                active_policies.add(policy_key)
        # compute and accumulate the thresholds for each Policy
        T = {}
        insufficient_data_policies = []
        should_create_incremental_values = True
        for policy_key in self.policies:
            the_data = D[policy_key]
            if params['use_incremental_method']:
                self.policies[policy_key].cycle_incremental_values(kpi_at_incremental_values['policies'][policy_key]['incremental_values'], the_data, params, entity_config)
                T[policy_key] = self.policies[policy_key].get_thresholds(kpi_at_incremental_values['policies'][policy_key]['incremental_values'], params['kpi'], True, entity_config)
                self.incremental_values = kpi_at_incremental_values
            elif not entity_config:
                policy_type = params['kpi']['settings']['policies'][policy_key]['policy_type']
                time_blocks = params['kpi']['settings']['policies'][policy_key]['time_blocks']
                if params['incremental_learning_enabled'] and should_create_incremental_values and policy_type in ('stdev', 'range', 'percentage', 'quantile'):
                    dynamic_params = [{'severityValue': tl['severityValue'], 'dynamicParam': tl['dynamicParam']} for tl in params['kpi']['settings']['policies'][policy_key]['aggregate_thresholds']['thresholdLevels']]
                    kpi_at_incremental_values['policies'][policy_key] = {'policy_type': policy_type, 'time_blocks': time_blocks, 'dynamic_params': dynamic_params}
                    kpi_at_incremental_values['policies'][policy_key]['incremental_values'] = self.policies[policy_key].get_incremental_values(the_data, params)
                    kpi_at_incremental_values["aggregate_outlier_detection_enabled"] = params['kpi']['detect_outliers']
                    kpi_at_incremental_values["outlier_detection_algo"] = params['kpi']['outlier_detection_algo']
                    kpi_at_incremental_values["outlier_detection_sensitivity"] = params['kpi']['outlier_multiplier']
                    kpi_at_incremental_values["adaptive_thresholding_training_window"] = params['kpi']['adaptive_thresholding_training_window']
                    self.incremental_values = kpi_at_incremental_values
                T[policy_key] = self.policies[policy_key].get_thresholds(the_data, params['kpi'])
            else:
                policy_type = entity_config['time_variate_thresholds_specification']['policies'][policy_key]['policy_type']
                time_blocks = entity_config['time_variate_thresholds_specification']['policies'][policy_key]['time_blocks']
                if params['incremental_learning_enabled'] and should_create_incremental_values and policy_type in ('stdev', 'range', 'percentage', 'quantile'):
                    dynamic_params = [{'severityValue': tl['severityValue'], 'dynamicParam': tl['dynamicParam']} for tl in entity_config['time_variate_thresholds_specification']['policies'][policy_key]['entity_thresholds']['thresholdLevels']]
                    kpi_at_incremental_values['policies'][policy_key] = {'policy_type': policy_type, 'time_blocks': time_blocks, 'dynamic_params': dynamic_params}
                    kpi_at_incremental_values['policies'][policy_key]['incremental_values'] = self.policies[policy_key].get_incremental_values(the_data, params, entity_config)
                    kpi_at_incremental_values["adaptive_thresholding_training_window"] = entity_config['adaptive_thresholding_training_window']
                    if 'aggregate_outlier_detection_enabled' in entity_config:
                        kpi_at_incremental_values["aggregate_outlier_detection_enabled"] = entity_config['aggregate_outlier_detection_enabled']
                        kpi_at_incremental_values["outlier_detection_algo"] = entity_config['outlier_detection_algo']
                        kpi_at_incremental_values["outlier_detection_sensitivity"] = entity_config['outlier_detection_sensitivity']
                    self.incremental_values = kpi_at_incremental_values
                T[policy_key] = self.policies[policy_key].get_thresholds(the_data, params['kpi'], False, entity_config)

            if T[policy_key] is None and policy_key in active_policies:
                insufficient_data_policies.append(self.policies[policy_key].title)
                self.logger.info(
                    "Insufficient data for threshold calculation: %d values." % len(D[policy_key]))

        if len(insufficient_data_policies) > 0:
            add_message(params['out_metadata'], 'WARN',
                        'insufficient data in ITSI summary index for policies %s' % str(insufficient_data_policies))
        return T

    def get_thresholds(self, data, params, entity_config={}):
        """Computes thresholds for a KPI and this schedule.

        :param data: dict with 'alert_value': list of floats
                     '_time': list of float epoch timestamps
        :param params: dict with kpi settings
        :param entity_config: entity level configuration object
        Returns a dict of lists of threshold level structures, keyed by policy.key;
        the structures should have a populated `thresholdValue` field obtained from the result of the computation
        """
        metadata = params['out_metadata']
        thresholds = {}
        kpi_info = 'kpiid="%s" on serviceid="%s"' % (str(params['kpi']['kpi_id']), str(params['kpi']['service_id']))
        try:
            thresholds = self._get_thresholds(data=data, params=params, entity_config=entity_config)
        except ValueError:
            log_and_warn(metadata=metadata, logger=self.logger,
                         msg='Unconvertible alert_values found for ' + kpi_info,
                         search_msg="unconvertible values found (check this KPI's `alert_value` "
                                    "field in ITSI summary index)")
        except AssertionError:
            # Method should probably raise a ValueError/try to convert 0-100 to 0.0-1.0, but for now log nicely
            log_and_warn(metadata=metadata, logger=self.logger,
                         msg='Invalid quantile specified for %s, must be between 0.0 and 1.0' % kpi_info,
                         search_msg='invalid quantile value, must be between 0.0 and 1.0')
        except Exception as e:
            log_and_warn(metadata=metadata, logger=self.logger, msg=str(e))
            log_and_warn(metadata=metadata, logger=self.logger,
                         msg='Unexpected exception when computing thresholds for %s' % kpi_info)

        return thresholds
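
    # Shape of the returned mapping (illustrative): one entry per policy key; a
    # policy with insufficient data maps to None, otherwise to its updated levels:
    #
    #   {'policy_a': [{'dynamicParam': '1', 'thresholdValue': 12.0, ...}, ...],
    #    'policy_b': None}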


def create_schedule(params, entity_config=None):
    policies = {}
    metadata = params['out_metadata']
    settings = entity_config['time_variate_thresholds_specification'] if entity_config else params['kpi']['settings']
    logger = params['logger']

    # get policy settings for this KPI, create Policy objects
    for policy_key in settings['policies']:
        t_method = str(settings['policies'][policy_key]['policy_type'])
        t_title = str(settings['policies'][policy_key].get('title', policy_key))
        try:
            t_levels = settings['policies'][policy_key][params['threshold_key']]['thresholdLevels']
        except KeyError as e:
            # we just skip this policy
            logger.exception(e)
            log_and_warn(metadata=metadata, logger=logger, msg="Failed to retrieve %ss: %s" % (params['threshold_key'], e))
            continue

        policy_key = str(policy_key)
        if t_method == 'static':
            logger.info("Skipping static policy '%s'", policy_key)
        elif not isinstance(t_levels, list) or not t_levels:
            log_and_warn(metadata=metadata, logger=logger,
                         msg="Unable to apply adaptive thresholding on policy '%s': please specify threshold values "
                             "for the policy" % t_title)
            continue
        else:
            skip_policy = False
            for x in t_levels:
                if 'dynamicParam' not in x:
                    log_and_warn(metadata=metadata, logger=logger,
                                 msg="Unable to apply adaptive thresholding on policy '%s': Missing threshold "
                                     "value." % t_title)
                    skip_policy = True
                    break
                try:
                    float(x['dynamicParam'])
                except (TypeError, ValueError):
                    log_and_warn(metadata=metadata, logger=logger,
                                 msg="Unable to apply adaptive thresholding on policy '%s': Invalid threshold "
                                     "value: %s" % (t_title, x['dynamicParam']))
                    skip_policy = True
                    break
            if skip_policy:
                continue

            logger.debug("Loading settings for policy %s: method=%s levels=%s" % (
                policy_key, t_method, t_levels))
            try:
                at_run_params = {'at_run_epoch': params['at_run_epoch'],
                                 'use_temp_collection': params['use_temp_collection'],
                                 'service_id': params['kpi']['service_id'],
                                 'training_window': params['kpi']['adaptive_thresholding_training_window']}
                policies[policy_key] = Policy(
                    key=policy_key, method=t_method, parameters=t_levels, title=t_title, logger=logger, at_run_params=at_run_params)
            except ValueError as e:
                logger.exception(e)
                log_and_warn(metadata=metadata, logger=logger, msg="Invalid arguments sent to Policy.")

    the_schedule = None
    if len(policies) == 0:
        return
    try:
        the_schedule = Schedule(
            kpi_object=params['kpi']['kpi_object'], policies=policies, threshold_spec=settings, params=params)
    except ValueError as e:
        logger.exception(e)
        log_and_warn(metadata=metadata, logger=logger, msg="Invalid arguments sent to Schedule.")
    return the_schedule


def output_results(at_command, params, thresholds, data, entity_config=None):
    """
    thresholds: dict of lists of threshold levels structures, keyed by policy id
    """
    settings = entity_config['time_variate_thresholds_specification'] if entity_config else params['kpi']['settings']
    service_id = params['kpi']['service_id']
    kpi_id = params['kpi']['kpi_id']

    if not thresholds and not at_command:
        alerts_converted = data["alert_value"]
        time_converted = data["_time"]
        filter = PolicyFilter(settings)
        for index in range(len(time_converted)):
            try:
                alert_val = alerts_converted[index]
                time_val = time_converted[index]
                policy_key = filter.get_policy_key(time_val)
                line = {
                    'policy_key': policy_key, 'itsi_service_id': service_id, 'itsi_kpi_id': kpi_id,
                    'alert_value': alert_val, '_time': time_val
                }
                if entity_config:
                    line.update({'entity_key': entity_config['entity_key'], 'entity_title': entity_config['entity_title']})
            except IndexError:
                raise Exception(data)
            params['writer'].writerow(line)
    else:
        for policy_id in thresholds:
            t = thresholds[policy_id]
            if t is not None:
                if params['use_kv_store']:
                    if len(t) != len(settings['policies'][policy_id][params['threshold_key']]['thresholdLevels']):
                        kpistr = ""
                        if service_id is not None and kpi_id is not None and service_id != "" and kpi_id != "":
                            kpistr = " for kpi %s" % str(service_id) + ":" + str(kpi_id)
                        found = len(settings['policies'][policy_id][params['threshold_key']]['thresholdLevels'])
                        msg = "Mismatched number of thresholdLevels%s. Generated %d but found %d." % (
                            kpistr, len(t), found)
                        log_and_warn(metadata=params['out_metadata'], logger=params['logger'], msg=msg)
                    else:
                        # n.b. we assume thresholdLevels objects are
                        # sorted by increasing thresholdValue
                        # move this update_thresholds to outside
                        if entity_config:
                            params['entity_threshold_object'].update_thresholds(
                                policy=policy_id, thresholds=t, entity=entity_config)
                        else:
                            params['kpi']['kpi_object'].update_thresholds(
                                policy=policy_id, thresholds=t)

                line = {
                    'policy_id': policy_id, 'itsi_service_id': service_id, 'itsi_kpi_id': kpi_id}
                if entity_config:
                    line.update({'entity_key': entity_config['entity_key'], 'entity_title': entity_config['entity_title']})

                for thresh_index in range(len(t)):
                    line['threshold_' + str(thresh_index)] = t[thresh_index].get('thresholdValue')
                    line['threshold_metadata_' + str(thresh_index)] = t[thresh_index]
                params['writer'].writerow(line)
        if params['kpi']['adaptive_thresholding_copy_kpi_to_entity'] and not entity_config:
            params['kpi']['kpi_object'].copy_kpi_thresholds_to_base_entity_thresholds()

    return


def ignore_invalid_row(warn_message, logger):
    """
    Method to log a warning and ignore the read row result.
    Assumes read_chunk was invoked before this method is invoked.

    @type: basestring
    @param warn_message: warning message to log

    @rtype: None
    @return: None
    """
    logger.warn(warn_message)
    # Dummy response to ignore
    write_chunk(sys.stdout, {"finished": False}, '')


def gather_input_data(params, logger, fields_list):
    kpidict = dict()  # kpidict['itsi_service_id']['itsi_kpi_id']
    while True:
        params['out_metadata']['finished'] = False
        ret = read_chunk(sys.stdin, logger)
        if not ret:
            break
        metadata, body = ret
        parse_input_data(
            the_dict=kpidict, data=body, fields_list=fields_list, params=params)
        write_chunk(sys.stdout, params['out_metadata'], '')
        if metadata.get('finished', False):
            break
    params['kpidict'] = kpidict
    params['outbuf'] = StringIO()


def last_dst_change_timestamp(logger, dst_timezone):
    """
    Returns the last dst change timestamp for the provided timezone

    @type logger: logger object
    @param logger: logger object

    @type: string
    @param dst_timezone: timezone provided by the user in itsi_settings.conf
    """
    try:
        # Creates a time zone object based on the provided timezone name
        tz = None
        try:
            tz = pytz.timezone(dst_timezone)
        except UnknownTimeZoneError as e:
            logger.exception(e)
            logger.error('Found Unknown timezone')

        if tz:
            # Gets the current time in the specified time zone.
            now = datetime.now(tz)

            # Gets a list of UTC transition times for the time zone.
            # These are the times at which the offset from UTC changes due to daylight
            # saving time changes or other reasons.
            # For example [
            #     datetime.datetime(2023, 3, 12, 10, 0),
            #     datetime.datetime(2023, 11, 5, 9, 0),
            #     datetime.datetime(2024, 3, 10, 10, 0),
            #     datetime.datetime(2024, 11, 3, 9, 0),
            #     datetime.datetime(2025, 3, 9, 10, 0),
            #     datetime.datetime(2025, 11, 2, 9, 0)
            # ]
            transitions = list(tz._utc_transition_times)

            # Converts each UTC transition time to a timezone-aware datetime in UTC
            # [
            #     datetime.datetime(2023, 3, 12, 10, 0, tzinfo=<UTC>),
            #     datetime.datetime(2023, 11, 5, 9, 0, tzinfo=<UTC>),
            #     datetime.datetime(2024, 3, 10, 10, 0, tzinfo=<UTC>),
            #     datetime.datetime(2024, 11, 3, 9, 0, tzinfo=<UTC>),
            #     datetime.datetime(2025, 3, 9, 10, 0, tzinfo=<UTC>),
            #     datetime.datetime(2025, 11, 2, 9, 0, tzinfo=<UTC>)
            # ]
            transitions = [pytz.utc.localize(transition) for transition in transitions]

            # Sorts the list of transition times in ascending order.
            transitions.sort()
            last_dst_change = None

            for transition_time in transitions:
                # If the transition time is before the current time,
                # it updates the last_dst_change variable to this transition time.
                if transition_time < now:
                    last_dst_change = transition_time
                else:
                    break

            if last_dst_change:
                # Converts the datetime object to an epoch timestamp e.g. 1710064799
                return round(last_dst_change.timestamp(), 2)
            else:
                logger.error('Failed to fetch last dst change timestamp')
                return None
        else:
            logger.error('Failed to fetch last dst change timestamp')
            return None
    except Exception as e:
        logger.exception(e)
        logger.error('Failed to fetch last dst change timestamp')
        return None
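
# Illustrative example (hypothetical date): for 'America/Los_Angeles' queried on
# 2024-06-01, the most recent transition before "now" is 2024-03-10 10:00 UTC,
# so the function returns that instant as an epoch value, e.g. 1710064800.0.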


def get_at_dst_changes_details(logger, session_key):
    """
    Fetches the dst_changes details from the apply_dst_to_at stanza of itsi_settings.conf
    and returns the data required to apply dst changes

    @type logger: logger object
    @param logger: logger object

    @type: string
    @param session_key: the splunkd session key for the request
    """
    # Fetch data from apply_dst_to_at stanza of itsi_settings.conf
    cfm = ConfManager(session_key, 'SA-ITOA')
    conf = cfm.get_conf('itsi_settings')
    disable_dst_to_at = 1
    dst_timezone = ''
    dst_offset = 0
    try:
        apply_dst_to_at = conf.get('apply_dst_to_at')
        disable_dst_to_at = int(apply_dst_to_at.get('disabled', 1))
        dst_timezone = apply_dst_to_at.get('timezone', '')
        dst_offset = int(apply_dst_to_at.get('offset', 0))
    except Exception as e:
        logger.exception(e)
        logger.error('Failed to fetch dst settings for the threshold calculation')

    days_since_dst = 0
    dst_change_timestamp = 0

    if not disable_dst_to_at:
        if dst_timezone and dst_offset != 0:
            # Fetch the last dst change timestamp as per the provided timezone
            dst_change_timestamp = last_dst_change_timestamp(logger, dst_timezone)
            if dst_change_timestamp:

                # Calculate the number of days past dst based on the timestamp
                last_dst_change_datetime = datetime.utcfromtimestamp(dst_change_timestamp)
                current_datetime = datetime.utcnow()
                days_since_dst = (current_datetime - last_dst_change_datetime).days

                # Disable apply_dst_to_at in itsi_settings.conf if more than 60 days have passed since the dst change
                if days_since_dst > 60:
                    try:
                        conf.update('apply_dst_to_at', {'disabled': 1})
                        disable_dst_to_at = 1
                    except Exception as e:
                        logger.exception(e)
                        logger.error('Failed to update dst disabled settings for the threshold calculation')
            else:
                try:
                    conf.update('apply_dst_to_at', {'disabled': 1})
                    disable_dst_to_at = 1
                except Exception as e:
                    logger.exception(e)
                    logger.error('Failed to update dst disabled settings for the threshold calculation')
                logger.error('Could not find a timestamp for the provided timezone. Hence DST changes to AT will not be applied')
        else:
            try:
                conf.update('apply_dst_to_at', {'disabled': 1})
                disable_dst_to_at = 1
            except Exception as e:
                logger.exception(e)
                logger.error('Failed to update dst disabled settings for the threshold calculation')
            logger.error('Could not find a timezone or an offset to apply DST changes to AT. Hence DST changes to AT will not be applied')

    return disable_dst_to_at, dst_timezone, dst_change_timestamp, dst_offset
|
|
|
|
|
|
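# A hypothetical apply_dst_to_at stanza in itsi_settings.conf that the function
# above would read (the key names come from the code; the values are
# illustrative only):
#
#     [apply_dst_to_at]
#     disabled = 0
#     timezone = America/Los_Angeles
#     offset = 60
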
def serialize_digest_data(incremental_value, field):
    """
    Serializes tdigest objects to be stored in the incremental values collection

    @type: dict
    @param incremental_value: incremental value with tdigest data

    @type: string
    @param field: field containing the tdigest data

    @rtype: dict
    @return: serialized tdigest data
    """
    digest = incremental_value[field]
    centroids = []
    # Each centroid is persisted as its mean ('m') and count ('c')
    for key in digest.C.keys():
        tree_values = digest.C.get_value(key)
        centroids.append({'m': tree_values.mean, 'c': tree_values.count})
    return {'n': digest.n, 'delta': digest.delta, 'K': digest.K, 'centroids': centroids}

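# Round-trip sketch for serialize_digest_data (illustrative; assumes the pypi
# tdigest package, whose update_from_dict accepts this same
# {'n', 'delta', 'K', 'centroids'} shape):
#
#     digest = TDigest()
#     digest.batch_update([1, 2, 3, 4, 5])
#     blob = serialize_digest_data({'digest': digest}, 'digest')
#     restored = TDigest()
#     restored.update_from_dict(blob)
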
def chunker(params, at_command=False):
    logger = params['logger']
    kpidict = params['kpidict']
    params['outbuf'] = StringIO()
    at_incremental_values = []

    if at_command:
        fields_list = ['policy_id']
        for k in range(10):
            fields_list.append("threshold_" + str(k))
            fields_list.append("threshold_metadata_" + str(k))
        fields_list = fields_list + ['itsi_service_id', 'itsi_kpi_id']

        if params['entity_level_thresholds']:
            fields_list = fields_list + ['entity_key', 'entity_title']
    else:
        fields_list = ['policy_key', 'itsi_service_id', 'itsi_kpi_id', 'alert_value', '_time']

    # prepare for generating output
    params['out_metadata']['finished'] = False

    # Create a dict writer with IO
    params['writer'] = csv.DictWriter(params['outbuf'], fieldnames=fields_list, dialect='excel', extrasaction='ignore')
    params['writer'].writeheader()

    # Get the service object
    params['service_object'] = get_service_object(params)
    params['disable_dst_to_at'], params['dst_timezone'], params['dst_change_timestamp'], params['dst_offset'] = \
        get_at_dst_changes_details(logger, session_key=params["session_key"])

    # Bulk fetch the services of the targeted kpis
    if params['service_object']:
        params['service_object'].bulk_fetch_service(kpidict.keys())

    if params['entity_level_thresholds']:
        params['entity_threshold_object'] = EntityThreshold(logger=params['logger'])
        params['entity_threshold_object'].initialize_interface(
            params['session_key'], owner='nobody')

    if at_command:
        list_kpis = []

        for itsi_service_id in kpidict:
            for itsi_kpi_id in kpidict[itsi_service_id]:
                list_kpis.append(itsi_kpi_id)

        # Get the active Custom Threshold Windows which are of type percentage
        ctw_object = CustomThresholdWindow(logger=logger)
        ctw_object.initialize_interface(
            params['session_key'], owner='nobody')
        ctw_linked_kpis = ctw_object.bulk_fetch_active_ctw(list_kpis)

    # Phase 2: iterate over (serviceid, kpiid) and output scores
    for itsi_service_id in kpidict:
        params['kpi'] = {
            'service_id': itsi_service_id,
            'service_data': None
        }
        if params['service_object']:
            # save the service data
            params['kpi']['service_data'] = params['service_object'].fetch_service(itsi_service_id)

        for itsi_kpi_id in kpidict[itsi_service_id]:
            params['kpi']['kpi_id'] = itsi_kpi_id
            # get the KPI object
            params['kpi']['kpi_object'] = get_kpi_object(params)
            if params['kpi']['kpi_object'] is None:
                ignore_invalid_row('No KPI found with id %s, ignoring ...' % itsi_kpi_id, logger)
                continue

            # get the settings
            kpi_tmp = params['kpi']['kpi_object'].get_kpi()

            if not isinstance(kpi_tmp, dict):
                ignore_invalid_row('No valid KPI found with id %s, ignoring ...' % itsi_kpi_id, logger)
                continue

            if 'time_variate_thresholds_specification' not in kpi_tmp:
                ignore_invalid_row(
                    'No valid thresholds specification found for KPI with id %s, ignoring ...' % itsi_kpi_id,
                    logger
                )
                continue

            params['kpi']['entity_thresholds'] = {}
            if params['entity_level_thresholds']:
                list_entity_keys = kpidict[itsi_service_id][itsi_kpi_id].keys()
                entity_threshold_configs = params['entity_threshold_object'].bulk_fetch_configs(
                    itsi_kpi_id, list_entity_keys, params['pseudo_entities'])

                if not entity_threshold_configs:
                    break

                for entity_config in entity_threshold_configs:
                    # Create a temp entity key to store the persistent entity config in a global object
                    entity_key = (
                        entity_config.get("entity_key")
                        if entity_config.get("entity_key") != 'N/A'
                        else hashlib.md5((entity_config['entity_title'] + entity_config['kpi_id']).encode("utf-8")).hexdigest()
                    )
                    params['kpi']['entity_thresholds'].update({entity_key: entity_config})

            params['kpi']['settings'] = kpi_tmp['time_variate_thresholds_specification']
            params['kpi']['detect_outliers'] = False
            params['kpi']['outlier_detection_algo'] = None
            params['kpi']['outlier_multiplier'] = None
            params['kpi']['adaptive_thresholding_training_window'] = kpi_tmp['adaptive_thresholding_training_window']
            params['kpi']['adaptive_thresholding_copy_kpi_to_entity'] = (
                kpi_tmp['adaptive_thresholding_copy_kpi_to_entity']
                if 'adaptive_thresholding_copy_kpi_to_entity' in kpi_tmp
                else False
            )
            if 'aggregate_outlier_detection_enabled' in kpi_tmp:
                params['kpi']['detect_outliers'] = False if params['entity_level_thresholds'] else kpi_tmp['aggregate_outlier_detection_enabled']
                if kpi_tmp['aggregate_outlier_detection_enabled']:
                    if 'outlier_detection_algo' in kpi_tmp:
                        params['kpi']['outlier_detection_algo'] = kpi_tmp['outlier_detection_algo']
                    if 'outlier_detection_sensitivity' in kpi_tmp:
                        params['kpi']['outlier_multiplier'] = kpi_tmp['outlier_detection_sensitivity']

            if at_command:
                if kpi_tmp['_key'] in ctw_linked_kpis and kpi_tmp['adaptive_thresholds_is_enabled']:
                    kpi_tmp['recalculate_custom_thresholds'] = True

            if params['kpi']['settings'] is not None:

                if params['entity_level_thresholds']:
                    for entity_key in kpidict[itsi_service_id][itsi_kpi_id]:
                        # Ignore the entity if no kpi_entity_threshold configuration is available for it
                        if not params['kpi']['entity_thresholds'].get(entity_key):
                            continue
                        schedule = calculate_thresholds(
                            at_command,
                            params=params,
                            data=kpidict[itsi_service_id][itsi_kpi_id][entity_key],
                            entity_config=params['kpi']['entity_thresholds'][entity_key])
                        if schedule and schedule.incremental_values:
                            at_incremental_values.append(schedule.incremental_values)
                else:
                    schedule = calculate_thresholds(at_command, params=params, data=kpidict[itsi_service_id][itsi_kpi_id], entity_config=None)
                    if schedule and schedule.incremental_values:
                        at_incremental_values.append(schedule.incremental_values)
            else:
                ignore_invalid_row(
                    'No valid thresholds specification found for KPI with id %s, ignoring ...' % itsi_kpi_id,
                    logger
                )
                continue
    if at_incremental_values:
        at_incremental_values_obj = ItsiAtIncrementalValues(params["session_key"], 'nobody')
        at_incremental_values_ = copy.deepcopy(at_incremental_values)
        for kpi_ in at_incremental_values_:
            for policy in kpi_["policies"]:
                if 'incremental_values' in kpi_["policies"][policy]:
                    for value in kpi_["policies"][policy]["incremental_values"]:
                        if 'digest' in value:
                            value["digest"] = serialize_digest_data(value, 'digest')
                        if 'unfiltered_digest' in value:
                            value["unfiltered_digest"] = serialize_digest_data(value, 'unfiltered_digest')
        at_incremental_values_obj.save_batch('nobody', at_incremental_values_, validate_names=False)

    # Write the output data into the buffer
    write_chunk(sys.stdout, params['out_metadata'], params['outbuf'].getvalue())

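# For at_command=True the DictWriter in chunker above emits a header of the
# form (abridged; the full list comes from fields_list):
#
#     policy_id,threshold_0,threshold_metadata_0,...,threshold_9,
#     threshold_metadata_9,itsi_service_id,itsi_kpi_id
#
# with entity_key and entity_title appended when entity_level_thresholds is set.
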
def calculate_thresholds(at_command, params, data, entity_config=None):
    """
    Calculates thresholds from the data dict

    @type: bool
    @param at_command: flag indicating whether this is an adaptive thresholding run

    @type: dict
    @param params: command parameters, including the logger and KPI context

    @type: dict
    @param data: data points dictionary to parse to get new threshold values

    @type: dict
    @param entity_config: entity level configuration object; only for entity level thresholding
    """
    the_schedule = None
    if at_command:
        # create the schedule
        the_schedule = create_schedule(params=params, entity_config=entity_config)
    values = clean_values(
        data=data,
        params=params
    )

    thresholds = None
    if at_command:
        # compute the thresholds
        if the_schedule is not None:
            thresholds = the_schedule.get_thresholds(data=values, params=params, entity_config=entity_config)
    else:
        thresholds = {}
        values['policy_key'] = []

    # write output to buffer
    output_results(at_command=at_command, params=params, thresholds=thresholds, data=values, entity_config=entity_config)

    return the_schedule

def remove_outliers(data, method, multiplier):
    # Default to the stdev method with a multiplier of 2 when not specified
    if method is None:
        method = "stdev"
    if multiplier is None:
        multiplier = 2
    return apply_outlier_algorithm(data, method, multiplier, remove=True)

def apply_outlier_algorithm(data, method, multiplier, remove=False):
    if data is None or 'alert_values' not in data:
        raise Exception("Data is empty or not in correct format for applying outlier algorithm")
    list_of_tuples = data['alert_values']
    arr_floats = [float(i[0]) for i in list_of_tuples]
    median = statistics.median(arr_floats)
    # Calculate bounds
    if method.lower() == 'mad':
        mad = statistics.median([abs(val - median) for val in arr_floats])
        upper_bound = median + (float(multiplier) * mad)
        lower_bound = median - (float(multiplier) * mad)
    elif method.lower() == 'iqr':
        arr_floats = sorted(arr_floats)
        iqr = quantile(arr_floats, 0.75) - quantile(arr_floats, 0.25)
        upper_bound = median + (float(multiplier) * iqr)
        lower_bound = median - (float(multiplier) * iqr)
    elif method.lower() == 'stdev':
        mean = statistics.mean(arr_floats)
        stdev = statistics.stdev(arr_floats)
        upper_bound = mean + (float(multiplier) * stdev)
        lower_bound = mean - (float(multiplier) * stdev)
    else:
        raise Exception("Unsupported outlier detection method: %s" % method)
    updated_values = []
    outliers = []
    # Iterate over data and mark outliers
    for x in list_of_tuples:
        x_list = list(x)
        try:
            x_list[3] = lower_bound
            x_list[4] = upper_bound
        except IndexError:
            raise Exception("Alert value tuple is missing bound fields: %s" % x_list)
        if float(x[0]) > upper_bound or float(x[0]) < lower_bound:
            x_list[2] = True
            outliers.append(tuple(x_list))
            if remove:
                continue
        updated_values.append(tuple(x_list))
    data['alert_values'] = updated_values
    return outliers, lower_bound, upper_bound

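# Minimal sketch of apply_outlier_algorithm on synthetic data (the 5-field
# tuple layout -- value, time, outlier flag, lower bound, upper bound -- is
# inferred from the index accesses above):
#
#     rows = [(str(v), 1700000000 + i, False, None, None)
#             for i, v in enumerate([10, 11, 9, 10, 50])]
#     data = {'alert_values': rows}
#     outliers, lo, hi = apply_outlier_algorithm(data, 'mad', 2)
#     # median = 10, MAD = 1 -> bounds (8, 12); only the 50 row is flagged
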
def detect_outliers(params, policy_chunks):
    """
    Updates the policy_chunks dictionary with outliers detected based on the chosen method,
    and also returns the outliers in a separate dictionary
    """
    policy_outlier_map = collections.OrderedDict()
    logger = params['logger']
    # Identify outliers per policy block
    for k, v in policy_chunks.items():
        data = {'alert_values': v}
        outliers, _, _ = apply_outlier_algorithm(data, params['method'].lower(), params['multiplier'])
        logger.debug("%s outliers identified for method: %s, multiplier: %s, outliers: %s" % (len(outliers), params['method'], params['multiplier'], outliers))
        policy_outlier_map[k] = outliers
        policy_chunks[k] = data['alert_values']
    return policy_outlier_map
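
# Illustrative call for detect_outliers (hypothetical policy key; same tuple
# layout as in the apply_outlier_algorithm sketch above):
#
#     chunks = collections.OrderedDict({'policy_a': rows})
#     params = {'logger': outlier_logger, 'method': 'mad', 'multiplier': 2}
#     outlier_map = detect_outliers(params, chunks)
#     # chunks['policy_a'] now carries bounds per row; outlier_map['policy_a']
#     # holds the rows flagged as outliers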