You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

657 lines
29 KiB

from __future__ import print_function, division
import logging
import re
import csv
import sys
import copy
import math
from chunked_util import read_chunk, write_chunk, add_message
from atad_utils import parse_input_data, clean_values, log_and_warn, log_and_die
import kpi
import custom_threshold_window
try:
# Python 2 case
from cStringIO import StringIO
except ImportError:
# Python 3 case
from io import StringIO
from splunk.clilib.bundle_paths import make_splunkhome_path
sys.path.append(make_splunkhome_path(['etc', 'apps', 'SA-ITOA', 'lib']))
import itsi_path
from ITOA.setup_logging import setup_logging
from itsi.itsi_time_block_utils import PolicyFilter
# Set this constant if you want to enable file-based KPI specification
# (useful for debugging without accessing the KV store).
# **** DO NOT SET THIS IN PRODUCTION (ITOA-3809) ****
# ENABLE_FILE_ARGUMENT = 1
##################
# itsiat
##################
# Command logs to $SPLUNK_HOME/var/log/splunk/itsi-atad.log
# contents of searchbnf.conf:
# [itsiat-command]
# syntax = itsiat (nokv) (file=<filename containing kpi json object>) (usetempcollection) (collection=<string: name of the collection>) (key=<string: object key>)
# description = Computes thresholds based on the input data and according to the schedules and policies specified in settings (in nokv mode) or found in the kv store (default). The data is partitioned according to which block of the schedule it corresponds to, then thresholds are computed for each block according to the rules in the associated policy. If any policies of any KPIs lack sufficient data to compute the thresholds as specified, the command will return no thresholds for that policy and will not update the corresponding thresholds. The _time field should be in UTC epoch time with the timezone specified in the KPI and that timezone should correspond with the timezone in which the time blocks are specified. No thresholds will be returned (or written to the KV store) for any KPIs for which an error was encountered; otherwise, the computed threhsolds will be output even if multiple thresholds have the same value. The command returns thresholds via stdout, and may additionally write them to the KV store if the appropriate arguments are passed. The empty string '' is an invalid value for all fields.
# shortdesc = Computes adaptive thresholds for the given data and kpi information (which it uses to acquire schedules and policies from the kv store).
# comment1 = An example using the command with the KV store (the 'table' command is optional):
# example1 = | table _time alert_value itsi_service_id itsi_kpi_id | itsiat
# comment2 = You can also pass a filename containing the kpi json directly to the command and receive the results as events (replace $SPLUNK_HOME with the correct path):
# example2 = | table _time alert_value itsi_service_id itsi_kpi_id | itsiat nokv file=$SPLUNK_HOME/etc/apps/SA-ITSI-ATAD/bin/test/SHKPI.json
# comment3 = You can use the command with a temporary collection in the KV store like this:
# example3 = | table _time alert_value itsi_service_id itsi_kpi_id | itsiat usetempcollection collection=temp_kpi_collection key=857d4397893137141fb6c427
# usage = public
# tags = kpi adaptive thresholding dynamic thresholds schedule blocks policy
# [itsiat-nokv-option]
# syntax = nokv
# description = When present, this flag makes the command use a file (specified in the settings argument) instead of the KV store to acquire the policies and schedules. The computed thresholds are returned as events.
# [itsiat-file-option]
# syntax = file=<filename containing KPI JSON object>
# description = In interactive mode (pass the "nokv" flag), the "file" parameter takes a filename containing the plaintext JSON of a KPI object. This has the Time Block and Threshold Policy data structures under the 'time_variate_thresholds_specification' key, which, in KV mode, the command retrieves from the KV store. If the nokv flag is not present, this argument is ignored.
# [itsiat-usetempcollection-option]
# syntax = usetempcollection
# description = When present, this flag makes the command use temporary collection in the KV store. The collection name and object key must both be provided. If the nokv flag is also present, the command throws an error.
# [itsiat-collection-option]
# syntax = collection=<string: temp collection name>
# description = The name of the temporary collection to use.
# [itsiat-key-option]
# syntax = key=<string: temp object key>
# description = The key to use for the object in the temporary collection.
# Windows will mangle our line-endings unless we do this.
if sys.platform == "win32":
import os
import msvcrt
msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
msvcrt.setmode(sys.stderr.fileno(), os.O_BINARY)
msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY)
logger = setup_logging("itsi_atad.log", "itsi.at", level=logging.DEBUG)
def quantile(data, q):
"""Naive implementation of linear-interpolated quantile.
Comparable to numpy.percentile()/pd.DataFrame.quantile().
Author: Jacob Leverich (jleverich@splunk.com)
"""
assert q >= 0. and q <= 1.
m = float(len(data) - 1)
i = m * q
ilow = math.floor(i)
ihigh = math.ceil(i)
if ilow == ihigh:
return data[int(ilow)]
f = (i - ilow) / (ihigh - ilow)
low = data[int(ilow)]
high = data[int(ihigh)]
return low + f * (high - low)
def quantiles(data, levels):
data = sorted(data)
out = {
l: quantile(data, float(l))
for l in levels
}
return out
# Policy Class
class Policy(object):
def __init__(self, key, method, parameters, **kwargs):
# validate methods and parameters
if not isinstance(key, str):
raise ValueError(
"Null or non-string key sent to Policy constructor.")
if not isinstance(method, str):
raise ValueError(
"Null or non-string method sent to Policy constructor. Must be a string: stdev, quantile, range, or percentage.")
method_str = str(method)
if method_str not in ['stdev', 'quantile', 'range', 'percentage']:
raise ValueError(
"Method must be one of stdev, quantile, range, or percentage.")
if not parameters: # parameters is a list of theshold levels
raise ValueError("Null parameters sent to Policy constructor.")
if not isinstance(parameters, list) or len(parameters) > 10:
raise ValueError(
"Parameters must be a list of no more than 10 levels.", parameters)
if not all('dynamicParam' in x for x in parameters):
raise ValueError("Every level record must have a dynamicParam attribute")
# store policies in form amenable to computing thresholds
self.key = key
self.method = method_str
self.parameters = parameters
self.title = kwargs.get('title', key)
@property
def parameter_values(self):
# property that extracts dynamic param values from parameter list
return [float(x['dynamicParam']) for x in self.parameters]
def get_updated_levels(self, computed_thresholds):
"""
Returns a copy of the levels structure stored in self.parameters
where thresholdValue field is updated from the computed levels array
"""
if len(computed_thresholds) != len(self.parameters):
raise ValueError("Computed thresholds and stored thresholds structures are not of the same length")
result = []
for computed_value, level in zip(computed_thresholds, self.parameters):
level_copy = copy.copy(level)
level_copy['thresholdValue'] = computed_value
result.append(level_copy)
logger.debug("Updated thresholdLevels for policy %s are %s", self.key, result)
return result
# values: a dict with ['alert_value'] = floats (possibly non-contiguous
# and out-of-order) from all the blocks that have this policy.
# returns a copy of threshold levels structure with thresholdValue field updated
def get_thresholds(self, values):
D = {'alert_value': [v for v in values if not math.isnan(v)]}
if self.method is None:
raise UnboundLocalError("No method set for Policy.")
if len(D['alert_value']) < 30:
logger.error("There are less than 30 data points in policy: %s" % self.key)
return None
if self.method == 'stdev': # pretty standard, really
# Simple two-pass algorithm for calculating stdev. Reasonably numerically stable.
mean = sum(D['alert_value']) / len(D['alert_value'])
sqe = sum((x - mean) ** 2. for x in D['alert_value'])
std = math.sqrt(sqe / (len(D['alert_value']) - 1))
return self.get_updated_levels([mean + (std * c) for c in self.parameter_values])
# formerly iqr and same as "mass" in prior iterations
elif self.method == 'quantile':
T = quantiles(D['alert_value'], self.parameter_values)
return self.get_updated_levels([T[k] for k in self.parameter_values])
elif self.method == 'range': # equal width bands
# NOTE: sensitive to outliers in training data (remove first)
dmax = max(D['alert_value'])
dmin = min(D['alert_value'])
span = dmax - dmin
return self.get_updated_levels([dmin + (span * c) for c in self.parameter_values])
elif self.method == 'percentage':
# Simple Percentage as a baseline algorithm, calculate mean and use it as a base of percentage
mean = sum(D['alert_value']) / len(D['alert_value'])
return self.get_updated_levels([mean * (1 + c/100) for c in self.parameter_values])
else:
ValueError("Invalid thresholding method: " + self.method)
# Schedule Class
class Schedule(object):
# policies: dict of Policy Objects keyed by policy.key
# schedule: dict of policy_keys keyed by block_keys
def __init__(self, kpi_object, policies, threshold_spec):
# validate kpi
if kpi_object is None:
raise ValueError("Null KPI object sent to Schedule constructor.")
if not isinstance(kpi_object, kpi.KPIBase):
raise ValueError("KPI parameter must be a kpi.KPI object", kpi)
# validate policies
if policies is None:
raise ValueError("Null policy dict sent to Schedule constructor.")
if not isinstance(policies, dict):
raise ValueError(
"Policies parameter must be a dict, got %s." % type(policies))
if len(policies) > 169 or len(policies) == 0:
raise ValueError(
"Policies parameter must be a dict of no more than 168 Policy objects, got %s." % len(policies))
if sum([1 if not isinstance(p, Policy) else 0 for p in list(policies.values())]) > 0:
raise ValueError("All policies must be Policy objects.")
self.kpi_object = kpi_object
self.policies = policies
self.filter = PolicyFilter(threshold_spec)
def _get_thresholds(self, data, params):
if data is None:
raise ValueError("Null data sent to Schedule.")
if not isinstance(data, dict) or 'alert_value' not in data:
raise ValueError(
"Data passed to Schedule must be a dict with values in column 'alert_value'.")
# divide data based on policy: D[policy_key] = [floats]
D = {}
for policy_key in self.policies:
D[policy_key] = []
index_converted = data['_time']
active_policies = set()
for data_index in range(len(index_converted)):
# provide a timestamp and TZ, get the policy that includes this timestamp
policy_key = self.filter.get_policy_key(time=index_converted[data_index])
if policy_key in D:
D[policy_key].append(data['alert_value'][data_index])
active_policies.add(policy_key)
# compute and accumulate the thresholds for each Policy
T = {}
insufficient_data_policies = []
for policy_key in self.policies:
the_data = D[policy_key]
T[policy_key] = self.policies[policy_key].get_thresholds(the_data)
if T[policy_key] is None and policy_key in active_policies:
insufficient_data_policies.append(self.policies[policy_key].title)
logger.info(
"Insufficient data for threshold calculation: %d values." % len(D[policy_key]))
if len(insufficient_data_policies) > 0:
add_message(params['out_metadata'], 'WARN',
'insufficient data in ITSI summary index for policies %s' % str(insufficient_data_policies))
return T
def get_thresholds(self, data, params):
"""Computes thresholds for a KPI and this schedule.
:param data: dict with 'alert_value': list of floats
'_time': list of float epoch timestamps
:param params: dict with kpi settings
Returns a dict of lists of threshold level structures, keyed by policy.key;
the structures should have a populated `thresholdValue` field obtained from the result of the computation
"""
metadata = params['out_metadata']
thresholds = {}
kpi_info = 'kpiid="%s" on serviceid="%s"' % (str(params['kpi']['service_id']), str(params['kpi']['kpi_id']))
try:
thresholds = self._get_thresholds(data=data, params=params)
except ValueError as e:
params['logger'].exception(e)
log_and_warn(metadata=metadata, logger=params['logger'],
msg='Unconvertible alert_values found for ' + kpi_info,
search_msg="unconvertible values found (check this KPI's `alert_value` "
"field in ITSI summary index")
except AssertionError as e:
# Method should probably raise a ValueError/try to convert 0-100 to 0.0-1.0, but for now log nicely
params['logger'].exception(e)
log_and_warn(metadata=metadata, logger=params['logger'],
msg='Invalid quantile specified for %s, must be between 0.0 and 1.0' % kpi_info,
search_msg='invalid quantile value, must be between 0.0 and 1.0')
except Exception as e:
params['logger'].exception(e)
log_and_warn(metadata=metadata, logger=params['logger'],
msg='Unexpected exception when computing thresholds for %s' % kpi_info)
return thresholds
def parse_args(args, in_metadata, out_metadata, logger):
params = {}
params['use_kv_store'] = True
params['use_temp_collection'] = False
if 'nokv' in args:
params['use_kv_store'] = False
if 'usetempcollection' in args:
params['use_temp_collection'] = True
r = re.search('\s*collection\s*=\s*(?P<coll>\S+)\'', str(args))
if r is not None:
try:
params['temp_collection'] = r.group('coll')
logger.debug("Temporary collection name: %s" %
str(params['temp_collection']))
except:
log_and_die(metadata=out_metadata, logger=logger,
msg='Failed to parse temporary collection name in parameters.')
else:
log_and_die(metadata=out_metadata, logger=logger,
msg='Must provide a temporary collection name.')
r = re.search('\s*key\s*=\s*(?P<key>\S+)\'', str(args))
if r is not None:
try:
params['temp_key'] = r.group('key')
logger.debug("Temporary object key: %s" %
str(params['temp_key']))
except:
log_and_die(metadata=out_metadata, logger=logger,
msg='Failed to parse temporary object key in parameters.')
else:
log_and_die(metadata=out_metadata, logger=logger,
msg='Must provide a temporary object key.')
params['session_key'] = str(in_metadata['searchinfo']['session_key'])
if globals().get('ENABLE_FILE_ARGUMENT', False):
r = re.search('\s*file\s*=\s*(?P<fname>\S+)\'', str(args))
else:
r = None
if r is not None and not params['use_kv_store']:
try:
params['settings_file'] = r.group('fname')
logger.debug("Settings file: %s" % str(params['settings_file']))
except:
log_and_die(
metadata=out_metadata, logger=logger, msg='Failed to parse settings file in parameters.')
elif not params['use_kv_store']:
log_and_die(
metadata=out_metadata, logger=logger, msg='No settings file specified.')
if not params['use_kv_store'] and params['use_temp_collection']:
log_and_die(
metadata=out_metadata, logger=logger, msg="Incompatible arguments passed: nokv and usetempcollection.")
return params
def create_schedule(params):
policies = {}
metadata = params['out_metadata']
settings = params['kpi']['settings']
# get policy settings for this KPI, create Policy objects
for policy_key in settings['policies']:
t_method = str(
settings['policies'][policy_key]['policy_type'])
t_title = str(settings['policies'][policy_key].get('title', policy_key))
try:
t_levels = settings['policies'][policy_key]['aggregate_thresholds']['thresholdLevels']
except KeyError as e:
# we just skip this policy
logger.exception(e)
log_and_die(metadata=metadata, logger=logger, msg="Failed to retrieve aggregate levels: %s" % e)
policy_key = str(policy_key)
if t_method == 'static':
logger.info("Skipping static policy '%s'", policy_key)
elif not isinstance(t_levels, list) or not t_levels:
log_and_die(metadata=metadata, logger=logger,
msg="Unable to apply adaptive thresholding on policy '%s': please specify threshold values "
"for the policy" % t_title)
else:
for x in t_levels:
if 'dynamicParam' not in x:
log_and_die(metadata=metadata, logger=logger,
msg="Unable to apply adaptive thresholding on policy '%s': Missing threshold "
"value." % t_title)
try:
float(x['dynamicParam'])
except (TypeError, ValueError):
log_and_die(metadata=metadata, logger=logger,
msg="Unable to apply adaptive thresholding on policy '%s': Invalid threshold "
"value: %s" % (t_title, x['dynamicParam']))
logger.debug("Loading settings for policy %s: method=%s levels=%s" % (
policy_key, t_method, t_levels))
try:
policies[policy_key] = Policy(
key=policy_key, method=t_method, parameters=t_levels, title=t_title)
except ValueError as e:
logger.exception(e)
log_and_die(metadata=metadata, logger=logger, msg="Invalid arguments sent to Policy.")
the_schedule = None
if len(policies) == 0:
return
try:
the_schedule = Schedule(
kpi_object=params['kpi']['kpi_object'], policies=policies, threshold_spec=settings)
except ValueError as e:
logger.exception(e)
log_and_die(metadata=metadata, logger=logger, msg="Invalid arguments sent to Schedule.")
return the_schedule
def get_service_object(params):
service_object = None
if params['use_kv_store'] and not params['use_temp_collection']:
service_object = kpi.Service(logger=logger)
service_object.initialize_interface(
params['session_key'], owner='nobody')
return service_object
def get_kpi_object(params):
kpi_object = None
if params['use_kv_store']:
if params['use_temp_collection'] and params['temp_collection'] is not None and params['temp_key'] is not None:
kpi_object = kpi.TempKPI(logger=logger,
temp_collection_name=params['temp_collection'], temp_object_key=params['temp_key'])
else:
kpi_object = kpi.ServiceKPI(
logger=logger, service_data=params['kpi']['service_data'], kpi_id=params['kpi']['kpi_id'])
kpi_object.initialize_interface(
params['session_key'], owner='nobody', namespace='SA-ITOA')
kpi_object.fetch_kpi()
logger.debug(
"Initialized KV interface with session key %s" % params['session_key'])
elif params['settings_file'] is not None:
kpi_object = kpi.FileBackedKPI(
logger=logger, filename=params['settings_file'])
return kpi_object
def output_results(thresholds, params):
"""
thresholds: dict of lists of threshold levels structures, keyed by policy id
"""
settings = params['kpi']['settings']
service_id = params['kpi']['service_id']
kpi_id = params['kpi']['kpi_id']
for policy_id in thresholds:
t = thresholds[policy_id]
if t is not None:
if params['use_kv_store']:
if len(t) != len(settings['policies'][policy_id]['aggregate_thresholds']['thresholdLevels']):
kpistr = ""
if service_id is not None and kpi_id is not None and service_id != "" and kpi_id != "":
kpistr = " for kpi %s" % str(service_id) + ":" + str(kpi_id)
log_and_warn(metadata=params['out_metadata'], logger=params['logger'],
msg="Mismatched number of thresholdLevels%s. Generated %d but found %d." % (
kpistr, len(t),
len(settings['policies'][policy_id]['aggregate_thresholds']['thresholdLevels'])))
else:
# n.b. we assume thresholdLevels objects are
# sorted by increasing thresholdValue
# move this update_thresholds to outside
params['kpi']['kpi_object'].update_thresholds(
policy=policy_id, thresholds=t)
line = {
'policy_id': policy_id, 'itsi_service_id': service_id, 'itsi_kpi_id': kpi_id}
for thresh_index in range(len(t)):
line['threshold_' + str(thresh_index)] = t[thresh_index].get('thresholdValue')
line['threshold_metadata_' + str(thresh_index)] = t[thresh_index]
params['kpi']['writer'].writerow(line)
def main():
logger.debug(
"\n=========\nStarting ITSI adaptive thresholding.\n=========")
out_metadata = {}
out_metadata['inspector'] = {'messages': []}
# Phase 0: getinfo exchange
metadata, body = read_chunk(sys.stdin, logger)
# Don't run in preview.
if metadata.get('preview', False):
write_chunk(sys.stdout, {'finished': True}, '')
sys.exit(0)
args = str(metadata['searchinfo']['args'])
params = parse_args(
args=args, in_metadata=metadata, out_metadata=out_metadata, logger=logger)
params['logger'] = logger
params['out_metadata'] = out_metadata
params['out_metadata']['finished'] = False
fields_list = ['_time', 'itsi_service_id', 'itsi_kpi_id', 'alert_value']
params['out_metadata']['required_fields'] = fields_list
params['out_metadata']['type'] = 'reporting'
write_chunk(sys.stdout, params['out_metadata'], '')
params['out_metadata'].pop('type', None)
params['out_metadata'].pop('required_fields', None)
# Phase 1: gather the input data
kpidict = dict() # kpidict['itsi_service_id']['itsi_kpi_id']
while True:
params['out_metadata']['finished'] = False
ret = read_chunk(sys.stdin, logger)
if not ret:
break
metadata, body = ret
parse_input_data(
the_dict=kpidict, data=body, fields_list=fields_list, params=params)
write_chunk(sys.stdout, params['out_metadata'], '')
if metadata.get('finished', False):
break
def _ignore_invalid_row(warn_message):
"""
Method to log warning and ignore read row result
Assumes read_chunk was invoked before this method is invoked
@type: basestring
@param warn_message: warning message to log
@rtype: None
@return: None
"""
logger.warn(warn_message)
# Dummy response to ignore
write_chunk(sys.stdout, {"finished": False}, '')
# Get the service object
params['service_object'] = get_service_object(params)
# Bulk fetch the services of targeted kpis
if params['service_object']:
params['service_object'].bulk_fetch_service(kpidict.keys())
list_kpis = []
for itsi_service_id in kpidict:
for itsi_kpi_id in kpidict[itsi_service_id]:
list_kpis.append(itsi_kpi_id)
# Get the Active Custom Threshold Windows which are of type percentage
ctw_object = custom_threshold_window.CustomThresholdWindow(logger=logger)
ctw_object.initialize_interface(
params['session_key'], owner='nobody')
ctw_linked_kpis = ctw_object.bulk_fetch_active_ctw(list_kpis)
# Phase 2: iterate over (serviceid, kpiid) and output scores
for itsi_service_id in kpidict:
params['kpi'] = {
'service_id': itsi_service_id,
'service_data': None
}
if params['service_object']:
# save the service data
params['kpi']['service_data'] = params['service_object'].fetch_service(itsi_service_id)
for itsi_kpi_id in kpidict[itsi_service_id]:
params['kpi']['kpi_id'] = itsi_kpi_id
if not read_chunk(sys.stdin, logger):
break
# get the KPI object
params['kpi']['kpi_object'] = get_kpi_object(params)
if params['kpi']['kpi_object'] is None:
_ignore_invalid_row('No KPI found with id %s, ignoring ...' % itsi_kpi_id)
continue
# get the settings
kpi_tmp = params['kpi']['kpi_object'].get_kpi()
if not isinstance(kpi_tmp, dict):
_ignore_invalid_row('No valid KPI found with id %s, ignoring ...' % itsi_kpi_id)
continue
if 'time_variate_thresholds_specification' not in kpi_tmp:
_ignore_invalid_row(
'No valid thresholds specification found for KPI with id %s, ignoring ...' % itsi_kpi_id
)
continue
params['kpi']['settings'] = kpi_tmp[
'time_variate_thresholds_specification']
if kpi_tmp['_key'] in ctw_linked_kpis and kpi_tmp['adaptive_thresholds_is_enabled']:
kpi_tmp['recalculate_custom_thresholds'] = True
if params['kpi']['settings'] is not None:
# create the schedule
the_schedule = create_schedule(params=params)
# prepare the data
values = clean_values(
data=kpidict[itsi_service_id][itsi_kpi_id],
params=params
)
# compute the thresholds
if the_schedule is not None:
thresholds = the_schedule.get_thresholds(
data=values, params=params)
else:
thresholds = {}
# prepare for generating output
params['out_metadata']['finished'] = False
outbuf = StringIO()
fields_list = ['policy_id']
for k in range(10):
fields_list.append("threshold_" + str(k))
fields_list.append("threshold_metadata_" + str(k))
fields_list = fields_list + ['itsi_service_id', 'itsi_kpi_id']
params['kpi']['writer'] = csv.DictWriter(
outbuf, fieldnames=fields_list, dialect='excel', extrasaction='ignore')
params['kpi']['writer'].writeheader()
# write output to buffer
output_results(thresholds=thresholds, params=params)
# output the results
write_chunk(
sys.stdout, params['out_metadata'], outbuf.getvalue())
else:
_ignore_invalid_row(
'No valid thresholds specification found for KPI with id %s, ignoring ...' % itsi_kpi_id
)
continue
# After updating thresholds to all services, do single rest to batch update the services
if params['service_object']:
params['service_object'].batch_update_services()
# we're done, so send dummy response to finish the session
ret = read_chunk(sys.stdin, logger)
if ret:
write_chunk(sys.stdout, {"finished": True}, '')
logger.debug(
"\n=========\nFinished ITSI adaptive thresholding.\n=========")
if __name__ == "__main__":
main()