# Copyright (C) 2005-2024 Splunk Inc. All Rights Reserved.
import time
import json
from operator import itemgetter
from SA_ITOA_app_common.solnlib.splunk_rest_client import SplunkRestClient
import itsi_py3
from itsi.itsi_utils import ITOAInterfaceUtils
from ITOA.setup_logging import logger
from ITOA.itoa_common import get_current_utc_epoch
from splunk.util import normalizeBoolean

# Service health score (SHS) KPI keys carry this prefix.
SHKPI_STARTS_WITH = 'SHKPI-'
# Urgency 11 marks a "boom" (minimum-health) KPI; boom urgencies are excluded
# from the net-urgency denominator used to weight regular KPIs.
SHS_DEFAULT_URGENCY = 11
KPI_DEFAULT_URGENCY = 5
CRITICAL_SEVERITY = 6
HIGH_SEVERITY = 5

# Common parameters for fetching completed search-job results as JSON.
output_params = {
    'output_mode': 'json',
    'count': 0
}


def get_epoch_times(session_key, earliest, latest):
    """
    Convenience method to convert a set of latest and earliest to epoch

    @type: string
    @param session_key: splunkd session key used for the REST call
    @type: string
    @param earliest: earliest time specifier (e.g. '-60m')
    @type: string
    @param latest: latest time specifier (e.g. 'now')

    @rtype: tuple(earliest_time_in_seconds, latest_time_in_seconds)
    @return: a tuple with the earliest and latest times in seconds; on any
        failure, defaults to the 15 minutes ending at the current UTC time
    """
    try:
        uri_string = '/services/search/timeparser'
        _rest_client = SplunkRestClient(session_key, "SA-ITOA")
        response = _rest_client.get(
            uri_string,
            output_mode="json",
            time=[earliest, latest],
            output_time_format="%s"
        ).body.read()
        resp = json.loads(response)
        seconds_earliest_adjusted = float(resp[earliest])
        seconds_latest = float(resp[latest])
        logger.info(
            'Result of getting earliest and latest times from timeparser: '
            'response=%s', response)
        # Cap the window at 15 minutes.
        # Will most likely occur each time since the minimum time range for SA is 45 minutes
        if seconds_earliest_adjusted + 15 * 60 < seconds_latest:
            logger.info('The time range is too wide, adjusting earliest to 15 minutes before latest.')
            seconds_earliest_adjusted = seconds_latest - 15 * 60
        return seconds_earliest_adjusted, seconds_latest
    except Exception:
        logger.error('Error in getting earliest and latest times from timeparser, defaults to 15 minutes.')
        latest_epoch_time = get_current_utc_epoch()
        return latest_epoch_time - 15 * 60, latest_epoch_time


def generate_dependencies_json(session_key, data, service_id='', earliest_time='-60m', latest_time='now'):
    """
    Entry point for generating list of kpi dependencies

    @type: string
    @param session_key: the session key for the search
    @type: dict
    @param data: the lookup dict from service id to service data
    @type: string
    @param service_id: the current service to develop the list from
    @type: string
    @param: earliest_time: the earliest time in the interval to search over
    @type: string
    @param: latest_time: the latest time in the interval to search over

    @rtype: list
    @return: a list of impacting kpis sorted in order by rank
    """
    if not service_id:
        return []

    # Index the service list by '_key' for O(1) lookups during recursion.
    service_data = {}
    for service in data:
        key = service['_key']
        if not key:
            continue
        service_data[key] = service

    service_connection = ITOAInterfaceUtils.service_connection(session_key, app_name="itsi")
    visited = set()

    # Only search for degraded entities when the target service computes its
    # health score per-entity.
    should_run_degraded_entities_search = True
    if not service_data.get(service_id) or not normalizeBoolean(
            service_data.get(service_id).get('is_healthscore_calculate_by_entity_enabled')):
        should_run_degraded_entities_search = False

    degraded_entities_data = {}
    if should_run_degraded_entities_search:
        earliest_epoch_time, latest_epoch_time = get_epoch_times(session_key, earliest_time, latest_time)
        degraded_entities_search = ('| mstats max(alert_level) AS alert_level WHERE `get_itsi_summary_metrics_index` '
                                    'AND is_service_aggregate=0 '
                                    'earliest=' + str(earliest_epoch_time) + ', latest=' + str(latest_epoch_time) +
                                    ' by itsi_service_id, itsi_kpi_id, entity_key, entity_title span=1m '
                                    '| stats latest(alert_level) AS alert_level'
                                    ' by itsi_service_id, itsi_kpi_id, entity_key, entity_title '
                                    '| where alert_level>4 | stats list(itsi_kpi_id) as itsi_kpi_id'
                                    ' by itsi_service_id, entity_key, entity_title, alert_level')
        params = {
            'earliest_time': earliest_time,
            'latest_time': latest_time
        }
        search_job = service_connection.jobs.create(degraded_entities_search, **params)
        if not wait_for_job(search_job):
            search_time_out_msg = "Search for degraded entities timed out"
            logger.error(search_time_out_msg)
            raise Exception(search_time_out_msg)
        search_results = json.load(search_job.results(**output_params))
        results_data = search_results.get('results', [])
        logger.debug('Results for degraded entities search: ' + str(results_data))

        # Group degraded entities by KPI id. 'itsi_kpi_id' may come back as a
        # single string or a list, depending on the stats output.
        for result in results_data:
            if result['entity_key'] != 'service_aggregate':
                entity_data = {
                    'alert_level': int(float(result['alert_level'])),
                    'key': result['entity_key'],
                    'title': result['entity_title']
                }
                if isinstance(result['itsi_kpi_id'], itsi_py3.string_type):
                    kpis = [result['itsi_kpi_id']]
                else:
                    kpis = result['itsi_kpi_id']
                for kpi_id in kpis:
                    degraded_entities_data.setdefault(kpi_id, []).append(entity_data)
        logger.debug('Degraded entities data: ' + str(degraded_entities_data))

    kpis = generate_impacting_dependencies_list_recursive(data=service_data,
                                                          service_connection=service_connection,
                                                          visited=visited,
                                                          service_id=service_id,
                                                          degraded_entities_data=degraded_entities_data,
                                                          earliest_time=earliest_time,
                                                          latest_time=latest_time)
    if not kpis:
        return []

    output = sorted(kpis, key=itemgetter('impact'))
    # Limit number of results to 10
    results = output[:10]
    if should_run_degraded_entities_search:
        # Attach the degraded entities, most severe first, to each returned KPI.
        for result in results:
            kpi_id = result['_key']
            if kpi_id in degraded_entities_data:
                result['degraded_entities'] = sorted(degraded_entities_data[kpi_id],
                                                     key=itemgetter('alert_level'), reverse=True)
    logger.debug('Generate KPIs dependencies results returned: ' + str(results))
    return results


def generate_impacting_dependencies_list_recursive(data, service_connection, visited, service_id,
                                                   degraded_entities_data, earliest_time='-60m',
                                                   latest_time='now'):
    """
    For a given service, determine the kpis that have set its current health
    score and their relative impacts

    @type: dict
    @param data: the lookup dict from service id to service data
    @type: object
    @param service_connection: connection to the Splunk REST API
    @type: set
    @param visited: the set of tuples (from, to) denoting dependencies we have already visited
    @type: string
    @param service_id: the current service to develop the list from
    @type: dict
    @param degraded_entities_data: the dictionary containing degraded entities data
    @type: string
    @param: earliest_time: the earliest time in the interval to search over
    @type: string
    @param: latest_time: the latest time in the interval to search over

    @rtype: list
    @return: list of impacting kpis
    """
    service = data.get(service_id)
    if not service:
        return []

    contributing_kpis = {}
    net_urgency = 0

    # first, iterate the kpis and dependencies of the service
    # and store them in a dict with their urgencies and associated service id
    kpis = service.get('kpis', [])
    for kpi in kpis:
        if not kpi['_key'].startswith(SHKPI_STARTS_WITH):
            urgency = int(kpi['urgency'])
            formatted_kpi = {
                '_key': kpi['_key'],
                'is_healthscore_calculate_by_entity_enabled':
                    service.get('is_healthscore_calculate_by_entity_enabled'),
                'service_id': service_id,
                'title': kpi['title'],
                'urgency': urgency
            }
            contributing_kpis[kpi['_key']] = formatted_kpi
            # boom (urgency 11) KPIs do not contribute to the denominator
            net_urgency += urgency if urgency != 11 else 0

    dependencies = service.get('services_depends_on', [])
    for service_dependency in dependencies:
        kpi_dependencies = service_dependency.get('kpis_depending_on', [])
        overloaded_urgencies = service_dependency.get('overloaded_urgencies', {})
        dependent_service_id = service_dependency.get('serviceid', '')
        dependent_service = data.get(dependent_service_id, '')
        if dependent_service:
            # if the dependent service has not been deleted, loop over its dependencies
            for kpi in kpi_dependencies:
                # if the kpi is a SHS, it either has an overloaded urgency or defaults to 11
                # if the kpi is a kpi, it either has an overloaded urgency or defaults to 5
                default_urgency = SHS_DEFAULT_URGENCY if kpi.startswith(SHKPI_STARTS_WITH) else KPI_DEFAULT_URGENCY
                urgency = int(overloaded_urgencies.get(kpi, default_urgency))
                # only sum the non boom urgencies
                net_urgency += urgency if urgency != 11 else 0
                dependent_service_kpis = dependent_service.get('kpis')
                kpi_data = next((item for item in dependent_service_kpis if item['_key'] == kpi), {})
                title = kpi_data.get('title', '')
                # if the dependent kpi has not been deleted, store its id, name, and urgency
                if title:
                    svc_data = data.get(dependent_service_id)
                    contributing_kpis[kpi] = {
                        '_key': kpi,
                        'is_healthscore_calculate_by_entity_enabled':
                            svc_data.get('is_healthscore_calculate_by_entity_enabled'),
                        'service_id': dependent_service_id,
                        'title': title,
                        'urgency': urgency
                    }

    # Nothing to attribute: avoid issuing a malformed 'itsi_kpi_id IN ()' clause.
    if not contributing_kpis:
        return []

    # build the search string using the keys of the parsed kpis
    kpi_search_string = 'itsi_kpi_id IN (' + ','.join(contributing_kpis.keys()) + ') '
    spl_search = ('| mstats latest(alert_level) AS alert_level, latest(alert_value) AS alert_value'
                  ' WHERE `get_itsi_summary_metrics_index` ' + kpi_search_string +
                  ' `metrics_service_level_kpi_only` by itsi_kpi_id, itsi_service_id | where alert_level > -1 '
                  '| lookup kpi_alert_info_lookup alert_level OUTPUT weighted_contribution AS severity')
    params = {
        'earliest_time': earliest_time,
        'latest_time': latest_time
    }
    search_job = service_connection.jobs.create(spl_search, **params)
    if not wait_for_job(search_job):
        search_time_out_msg = "Search for dependent kpi severity timed out"
        logger.error(search_time_out_msg)
        raise Exception(search_time_out_msg)
    search_results = json.load(search_job.results(**output_params))

    non_boom_sum = 0
    non_boom_keys = []
    # Sentinel: replaced by the first boom KPI encountered; has no '_key'.
    min_boom_kpi = {'impact': float('inf')}
    list_min_boom_kpi = []
    results_data = search_results.get('results', [])
    logger.debug('Results for KPI dependencies search: ' + str(results_data))
    if len(results_data) == 0:
        logger.debug('generate_impacting_dependencies_list_recursive search did not return any results')
        return []

    # After the search is complete, we compute individual severities for each KPI
    # Additionally, we note which boom kpi has the minimum severity and the sum of the non boom kpi impacts
    for row in results_data:
        kpi_result = row.get('itsi_kpi_id')
        ref_kpi = contributing_kpis.get(kpi_result, '')
        if not ref_kpi:
            continue
        ref_kpi['alert_value'] = row['alert_value'] if 'alert_value' in row else 'N/A'
        ref_kpi['severity'] = int(row['alert_level'])
        if ref_kpi.get('urgency') == 11 or kpi_result in degraded_entities_data:
            if kpi_result in degraded_entities_data:
                # Per-entity health: impact 0 (ranks first) when either the KPI
                # or its most degraded entity is critical, else 30.
                # NOTE(review): 0/30 look like fixed rank buckets - confirm.
                most_degraded_entity_severity = int(degraded_entities_data[kpi_result][0]['alert_level'])
                if (most_degraded_entity_severity == CRITICAL_SEVERITY or
                        ref_kpi['severity'] == CRITICAL_SEVERITY):
                    entity_severity = 0
                else:
                    entity_severity = 30
                ref_kpi['impact'] = entity_severity
            else:
                # if the kpi is a boom kpi, store it if it has the minimum impact;
                # also keep the list of every boom kpi tied at that minimum
                ref_kpi['impact'] = int(row['severity'])
                if ref_kpi['impact'] < min_boom_kpi['impact']:
                    # new strict minimum: restart the tie list (the original
                    # appended to the stale list here, keeping wrong entries)
                    min_boom_kpi = ref_kpi
                    list_min_boom_kpi = [ref_kpi]
                elif ref_kpi['impact'] == min_boom_kpi['impact']:
                    list_min_boom_kpi.append(ref_kpi)
        else:
            # calculate the relative impact for each non boom kpi, add this to the sum, store the key
            ref_kpi['impact'] = 0 if net_urgency == 0 else int(row['severity']) * ref_kpi['urgency'] / net_urgency
            non_boom_sum += ref_kpi['impact']
            non_boom_keys.append(row['itsi_kpi_id'])

    if min_boom_kpi['impact'] > non_boom_sum and len(non_boom_keys) > 0:
        if net_urgency == 0:
            return []
        # non boom kpis have determined the current SHS
        non_boom_kpi_output = {}
        for kpi_key in non_boom_keys:
            kpi = contributing_kpis[kpi_key]
            if kpi_key.startswith(SHKPI_STARTS_WITH):
                # skip the recursive step for cyclic dependencies
                path = (service_id, kpi_key)
                if path in visited:
                    continue
                visited.add(path)
                dependent_kpis = generate_impacting_dependencies_list_recursive(
                    data, service_connection, visited, kpi.get('service_id'),
                    degraded_entities_data, earliest_time, latest_time)
                relative_urgency = kpi['urgency'] / net_urgency
                for dependent_kpi in dependent_kpis:
                    # scale the impact of a nested kpi by its SHS's relative urgency
                    dependent_kpi['impact'] *= relative_urgency
                    if dependent_kpi['_key'] not in non_boom_kpi_output:
                        non_boom_kpi_output[dependent_kpi['_key']] = dependent_kpi
                    else:
                        # if the kpi is already present, we add its impacts together.
                        # Look up by the nested kpi's own key (the original
                        # indexed by the SHS key, which is never stored here).
                        previous_impact = non_boom_kpi_output[dependent_kpi['_key']]['impact']
                        dependent_kpi['impact'] += previous_impact
                        non_boom_kpi_output[dependent_kpi['_key']] = dependent_kpi
            else:
                if kpi_key not in non_boom_kpi_output:
                    non_boom_kpi_output[kpi_key] = kpi
                else:
                    # if the kpi is already present, we add its impacts together
                    previous_impact = non_boom_kpi_output[kpi_key]['impact']
                    kpi['impact'] += previous_impact
                    non_boom_kpi_output[kpi_key] = kpi
        logger.debug('generate_impacting_dependencies_list_recursive results returned: ' + str(non_boom_kpi_output))
        # convert the output to a list and return
        return list(non_boom_kpi_output.values())
    else:
        # a boom kpi has determined the current SHS
        if '_key' not in min_boom_kpi:
            # sentinel untouched: no boom kpi survived filtering, so there is
            # nothing attributable (the original crashed on None.startswith)
            return []
        if min_boom_kpi['_key'].startswith(SHKPI_STARTS_WITH):
            # skip the recursive step for cyclic dependencies
            path = (service_id, min_boom_kpi['_key'])
            if path in visited:
                return []
            visited.add(path)
            # if the boom kpi is a service health score, we only want to return the kpis that comprise it
            return generate_impacting_dependencies_list_recursive(
                data, service_connection, visited, min_boom_kpi.get('service_id'),
                degraded_entities_data, earliest_time, latest_time)
        if len(list_min_boom_kpi) > 0:
            logger.debug('generate_impacting_dependencies_list_recursive results as list: ' + str(list_min_boom_kpi))
            return list_min_boom_kpi
        logger.debug('generate_impacting_dependencies_list_recursive results as single value: ' + str(min_boom_kpi))
        return [min_boom_kpi]


def wait_for_job(searchjob, maxtime=-1):
    """
    Wait up to maxtime seconds for searchjob to finish.
    Returns true, if job finished.

    @type: object
    @param searchjob: the search job to poll (must expose is_done())
    @type: number
    @param maxtime: maximum seconds to wait; a negative value waits forever

    @rtype: boolean
    @return: True if the job completed within the allotted time
    """
    pause = 0.2
    lapsed = 0.0
    while not searchjob.is_done():
        time.sleep(pause)
        lapsed += pause
        if maxtime >= 0 and lapsed > maxtime:
            break
    return searchjob.is_done()