#!/usr/bin/env python # coding=utf-8 __name__ = "trackme_rest_handler_component.py" __author__ = "TrackMe Limited" __copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K." __credits__ = "TrackMe Limited, U.K." __license__ = "TrackMe Limited, all rights reserved" __version__ = "0.1.0" __maintainer__ = "TrackMe Limited, U.K." __email__ = "support@trackme-solutions.com" __status__ = "PRODUCTION" # Built-in libraries import json import os import sys import time import requests # splunk home splunkhome = os.environ["SPLUNK_HOME"] # append current directory sys.path.append(os.path.dirname(os.path.abspath(__file__))) # import libs import import_declare_test # set logging from trackme_libs_logging import setup_logger logger = setup_logger( "trackme.rest.component_user", "trackme_rest_api_component_user.log" ) # Redirect global logging to use the same handler import logging logging.getLogger().handlers = logger.handlers logging.getLogger().setLevel(logger.level) # import rest handler import trackme_rest_handler # import trackme libs from trackme_libs import trackme_getloglevel, trackme_vtenant_account, trackme_reqinfo # import trackme libs utils from trackme_libs_utils import replace_encoded_backslashes, get_uuid # import Splunk libs import splunklib.client as client # import TrackMe get data libs from trackme_libs_get_data import ( search_kv_collection, get_target_from_kv_collection, get_sampling_kv_collection, get_collection_documents_count, get_wlk_apps_enablement_kv_collection, get_feeds_datagen_kv_collection, search_kv_collection_restmode, search_kv_collection_searchmode, search_kv_collection_sdkmode, ) # import TrackMe decision maker libs from trackme_libs_decisionmaker import ( pre_filter_records, filter_records, convert_epoch_to_datetime, get_monitoring_time_status, get_outliers_status, get_data_sampling_status, get_future_status, get_future_metrics_status, get_is_under_dcount_host, get_logical_groups_collection_records, get_dsm_latency_status, get_dsm_delay_status, set_dsm_status, set_dhm_status, set_mhm_status, set_flx_status, set_fqm_status, set_wlk_status, ack_check, define_state_icon_code, outliers_readiness, logical_group_lookup, set_feeds_lag_summary, set_feeds_thresholds_duration, dsm_sampling_lookup, outliers_data_lookup, sampling_anomaly_status, get_coll_docs_ref, docs_ref_lookup, wlk_disabled_apps_lookup, wlk_versioning_lookup, wlk_orphan_lookup, apply_blocklist, dsm_check_default_thresholds, dhm_check_default_thresholds, dynamic_priority_lookup, dynamic_tags_lookup, dynamic_sla_class_lookup, get_sla_timer, flx_thresholds_lookup, fqm_thresholds_lookup, flx_check_dynamic_thresholds, fqm_check_dynamic_thresholds, flx_drilldown_searches_lookup, flx_default_metrics_lookup, calculate_score, ) # import trackme libs disruption queue from trackme_libs_disruption_queue import ( disruption_queue_lookup, disruption_queue_update, disruption_queue_get_duration, ) # import chart generation functions from stateful alert helper from modalert_trackme_stateful_alert_helper import ( get_chart_search, get_mlmodels_from_kvstore, flx_get_metrics_catalog_for_object_id, fqm_get_metrics_catalog_for_object_id, wlk_get_metrics_catalog_for_object_id, remove_leading_spaces, ) class TrackMeHandlerComponentRead_v2(trackme_rest_handler.RESTHandler): def __init__(self, command_line, command_arg): super(TrackMeHandlerComponentRead_v2, self).__init__( command_line, command_arg, logger ) def get_resource_group_desc_component(self, request_info, **kwargs): response = { "resource_group_name": "component", "resource_group_desc": "Endpoints specific to TrackMe's components data offload (read only operations)", } return {"payload": response, "status": 200} # Get the component data with pagination and progressive load capabilities def get_load_component_data(self, request_info, **kwargs): describe = False try: params_dict = request_info.raw_args["query_parameters"] except Exception as e: params_dict = None try: resp_dict = json.loads(str(request_info.raw_args["payload"])) except Exception as e: resp_dict = None logger.info( f'function get_load_component_data called, params_dict="{params_dict}"' ) # Start performance counter start = time.time() if resp_dict is not None: try: describe = resp_dict["describe"] if describe in ("true", "True"): describe = True except Exception as e: describe = False if not params_dict and not resp_dict: describe = True # if describe is requested, show the usage if describe: response = { "describe": "This endpoint retrieves a TrackMe's component table data, it requires a GET call using params and the following options:", "resource_desc": "Get TrackMe's component data", "resource_spl_example": "| trackme url=\"/services/trackme/v2/component/load_component_data\" mode=\"get\" params=\"{'tenant_id': 'mytenant', 'component': 'flx', 'page': 1, 'size': 100}\"", "options": [ { "tenant_id": "(required) tenant identifier", "component": "(required) component identifier, valid options are: flx, dsm, dhm, mhm, wlk, fqm", "filter_object": "(optional) target a specific TrackMe object record, do not specify this for no filtering", "filter_key": "(optional) target a specific TrackMe key record, do not specify this for no filtering", "filter_objects": "(optional) comma-separated list of TrackMe object records to filter on, do not specify this for no filtering", "filter_keys": "(optional) comma-separated list of TrackMe key records to filter on, do not specify this for no filtering", "pagination_mode": "(optional) set to true to enable pagination, valid options are: local, remote. Defaults to remote.", "page": "(optional) page number, specific the page to be retrieved, defaults to page 1", "size": "(optional) number of records to retrieve, set to 0 with page: 1 to retrieve all records in a single operation", "mode_view": "(optional) for splk-dhm/splk-mhm/splk-wlk, the view mode, defaults to minimal, valid options are: minimal, compact, full", "load_charts_resources": "(optional) set to true to load the charts resources, defaults to false", } ], } return {"payload": response, "status": 200} if params_dict is not None: # tenant_id try: tenant_id = params_dict["tenant_id"] except Exception as e: return { "payload": { "action": "failure", "response": "the tenant_id is required", }, "status": 400, } # component try: component = params_dict["component"] except Exception as e: return { "payload": { "action": "failure", "response": "the component is required", }, "status": 400, } # pagination_mode, optional and defaults to False if not specified try: pagination_mode = params_dict["pagination_mode"] if pagination_mode not in ("local", "remote"): return { "payload": { "action": "failure", "response": "the pagination_mode is invalid", }, "status": 400, } except Exception as e: pagination_mode = "remote" # filter_object, optional and defaults to None if not specified try: filter_object = params_dict["filter_object"] except Exception as e: filter_object = None # filter_key, optional and defaults to None if not specified try: filter_key = params_dict["filter_key"] except Exception as e: filter_key = None # filter_objects, optional and defaults to None if not specified try: filter_objects = params_dict["filter_objects"] if filter_objects: filter_objects = [obj.strip() for obj in filter_objects.split(",")] except Exception as e: filter_objects = None # filter_keys, optional and defaults to None if not specified try: filter_keys = params_dict["filter_keys"] if filter_keys: filter_keys = [key.strip() for key in filter_keys.split(",")] except Exception as e: filter_keys = None # page, if not submitted, default to 1 try: page = int(params_dict["page"]) except Exception as e: page = 1 # size, if not submitted, default to 0 try: size = int(params_dict["size"]) except Exception as e: size = 0 # mode_view try: mode_view = params_dict["mode_view"] except Exception as e: mode_view = "minimal" logger.debug(f'mode_view="{mode_view}"') # load_charts_resources (accepts boolean, true or false as strings, 0 or 1 as integers or strings) try: load_charts_resources = params_dict["load_charts_resources"] if isinstance(load_charts_resources, str): if load_charts_resources in ("true", "True", "1"): load_charts_resources = True elif load_charts_resources in ("false", "False", "0"): load_charts_resources = False elif isinstance(load_charts_resources, int): if load_charts_resources == 1: load_charts_resources = True elif load_charts_resources == 0: load_charts_resources = False except Exception as e: load_charts_resources = False # Get splunkd port splunkd_port = request_info.server_rest_port # Get service service = client.connect( owner="nobody", app="trackme", port=splunkd_port, token=request_info.session_key, timeout=600, ) # set loglevel loglevel = trackme_getloglevel( request_info.system_authtoken, request_info.server_rest_port ) logger.setLevel(loglevel) # set instance_id self.instance_id = get_uuid() # Get trackmeconf trackme_conf = trackme_reqinfo( request_info.system_authtoken, request_info.server_rest_uri )["trackme_conf"] # Get virtual tenant account vtenant_conf = trackme_vtenant_account( request_info.system_authtoken, request_info.server_rest_uri, tenant_id, ) # # System level settings # system_future_tolerance = float( trackme_conf["splk_general"]["splk_general_feeds_future_tolerance"] ) # # System level default minimal disruption period # default_disruption_min_time_sec = int( vtenant_conf["default_disruption_min_time_sec"] ) # # Tenant level default monitoring time policy # try: default_monitoring_time_policy = vtenant_conf["monitoring_time_policy"] except Exception as e: default_monitoring_time_policy = "all_time" # # SLA timer # sla_classes = {} sla_default_class = None sla_classes = trackme_conf["sla"]["sla_classes"] # try loading the JSON try: sla_classes = json.loads(sla_classes) sla_default_class = trackme_conf["sla"]["sla_default_class"] if not len(sla_default_class) > 0 or sla_default_class not in sla_classes: sla_default_class = "silver" logger.error( f'instance_id={self.instance_id}, Invalid sla_default_class="{sla_default_class}", this SLA class is not part of the SLA classes, applying fallback configuration' ) except: logger.error( f'instance_id={self.instance_id}, Error loading sla_classes JSON, please check the configuration, the JSON is not valid JSON, applying fallback configuration, exception="{str(e)}"' ) sla_classes = json.loads( '{"gold": {"sla_threshold": 14400, "rank": 3}, "silver": {"sla_threshold": 86400, "rank": 2}, "platinum": {"sla_threshold": 172800, "rank": 1}}' ) sla_default_class = "silver" # retrieve the score for the tenant and component scores_dict = calculate_score(service, tenant_id, component) logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", component="{component}", scores_dict="{json.dumps(scores_dict, indent=2)}"' ) # dsm specific if component == "dsm": # docs references docs_is_global = "False" # doc_note_global docs_note_global = trackme_conf["splk_general"][ "splk_general_dsm_docs_note_global" ] if not docs_note_global: docs_note_global = "N/A" # docs_link_global docs_link_global = trackme_conf["splk_general"][ "splk_general_dsm_docs_link_global" ] if not docs_link_global: docs_link_global = "N/A" # both should be defined to be enabled if docs_note_global == "N/A" or docs_link_global == "N/A": docs_note_global = "N/A" docs_link_global = "N/A" else: docs_is_global = "True" # dhm specific if component == "dhm": macro_name = f"trackme_dhm_default_splk_dhm_alert_policy_tenant_{tenant_id}" macro_current = service.confs["macros"][macro_name] default_splk_dhm_alerting_policy = macro_current.content.get("definition") # remove double quotes from default_splk_dhm_alerting_policy default_splk_dhm_alerting_policy = default_splk_dhm_alerting_policy.replace( '"', "" ) logger.debug( f'instance_id={self.instance_id}, default_splk_dhm_alerting_policy="{default_splk_dhm_alerting_policy}"' ) # # splk-flx specific collections # if component == "flx": # Thresholds thresholds_collection_name = f"kv_trackme_flx_thresholds_tenant_{tenant_id}" thresholds_collection = service.kvstore[thresholds_collection_name] ( thresholds_records, thresholds_collection_keys, thresholds_collection_dict, last_page, ) = search_kv_collection_sdkmode( logger, service, thresholds_collection_name, page=1, page_count=0, orderby="keyid" ) logger.debug( f'instance_id={self.instance_id}, thresholds_collection_dict="{json.dumps(thresholds_collection_dict, indent=2)}"' ) # Drilldown searches drilldown_searches_collection_name = f"kv_trackme_flx_drilldown_searches_tenant_{tenant_id}" try: drilldown_searches_collection = service.kvstore[drilldown_searches_collection_name] ( drilldown_searches_records, drilldown_searches_collection_keys, drilldown_searches_collection_dict, last_page, ) = search_kv_collection_sdkmode( logger, service, drilldown_searches_collection_name, page=1, page_count=0, orderby="keyid" ) except Exception as e: logger.debug(f"instance_id={self.instance_id}, Drilldown searches collection not found or accessible: {str(e)}") drilldown_searches_records = [] drilldown_searches_collection_keys = [] drilldown_searches_collection_dict = {} logger.debug( f'instance_id={self.instance_id}, drilldown_searches_collection_dict="{json.dumps(drilldown_searches_collection_dict, indent=2)}"' ) # Default metrics default_metrics_collection_name = f"kv_trackme_flx_default_metric_tenant_{tenant_id}" try: default_metrics_collection = service.kvstore[default_metrics_collection_name] ( default_metrics_records, default_metrics_collection_keys, default_metrics_collection_dict, last_page, ) = search_kv_collection_sdkmode( logger, service, default_metrics_collection_name, page=1, page_count=0, orderby="keyid" ) except Exception as e: logger.debug(f"instance_id={self.instance_id}, Default metrics collection not found or accessible: {str(e)}") default_metrics_records = [] default_metrics_collection_keys = [] default_metrics_collection_dict = {} logger.debug( f'instance_id={self.instance_id}, default_metrics_collection_dict="{json.dumps(default_metrics_collection_dict, indent=2)}"' ) # # splk-fqm specific collections # if component == "fqm": # Thresholds thresholds_collection_name = f"kv_trackme_fqm_thresholds_tenant_{tenant_id}" thresholds_collection = service.kvstore[thresholds_collection_name] ( thresholds_records, thresholds_collection_keys, thresholds_collection_dict, last_page, ) = search_kv_collection_sdkmode( logger, service, thresholds_collection_name, page=1, page_count=0, orderby="keyid" ) logger.debug( f'instance_id={self.instance_id}, thresholds_collection_dict="{json.dumps(thresholds_collection_dict, indent=2)}"' ) # # Virtual tenant account settings # # outliers tenant level settings (deprecated - kept for backward compatibility) # These are no longer used with score-based approach, but kept for backward compatibility tenant_outliers_set_state = int(vtenant_conf.get("outliers_set_state", 1)) tenant_data_sampling_set_state = int(vtenant_conf.get("data_sampling_set_state", 1)) # # Logical groups collection records # logical_group_coll = service.kvstore[ f"kv_trackme_common_logical_group_tenant_{tenant_id}" ] ( logical_coll_records, logical_coll_dict, logical_coll_members_list, logical_coll_members_dict, logical_coll_count, ) = get_logical_groups_collection_records(logical_group_coll) # log debug logger.debug( f'instance_id={self.instance_id}, function get_logical_groups_collection_records, logical_coll_dict="{json.dumps(logical_coll_dict, indent=2)}", logical_coll_count="{logical_coll_count}"' ) # entities KV collection data_collection_name = f"kv_trackme_{component}_tenant_{tenant_id}" data_collection = service.kvstore[data_collection_name] # detect if we have multiple filters, if we do, set size to 0 as we need to retrieve all records multiple_filters = False query_parameters = request_info.raw_args["query_parameters"] if params_dict: # Loop through all query parameters for key, value in query_parameters.items(): if "filter[" in key: if key == "filter[1][field]": multiple_filters = True if multiple_filters: size = 0 if ( not filter_object and not filter_key and not filter_objects and not filter_keys ): # get records if size == 0: func_start = time.time() data_records, data_collection_keys, data_collection_dict, last_page = ( search_kv_collection( service, data_collection_name, page=1, page_count=0, ) ) last_page = 1 logger.info( f"instance_id={self.instance_id}, function search_kv_collection took {round(time.time() - func_start, 2)} seconds, records_count={len(data_records)}" ) else: func_start = time.time() data_records, data_collection_keys, data_collection_dict, last_page = ( search_kv_collection( service, data_collection_name, page=page, page_count=size, ) ) logger.info( f"instance_id={self.instance_id}, function search_kv_collection took {round(time.time() - func_start, 2)} seconds, records_count={len(data_records)}" ) elif filter_object: # filter on a given object data_records, data_collection_keys, data_collection_dict = ( get_target_from_kv_collection( "object", filter_object, data_collection, data_collection_name ) ) last_page = 1 total_record_count = len(data_records) elif filter_key: # filter on a given key data_records, data_collection_keys, data_collection_dict = ( get_target_from_kv_collection( "_key", filter_key, data_collection, data_collection_name ) ) last_page = 1 total_record_count = len(data_records) elif filter_objects: # filter on multiple objects data_records, data_collection_keys, data_collection_dict = ( get_target_from_kv_collection( "object", filter_objects, data_collection, data_collection_name ) ) last_page = 1 total_record_count = len(data_records) elif filter_keys: # filter on multiple keys data_records, data_collection_keys, data_collection_dict = ( get_target_from_kv_collection( "_key", filter_keys, data_collection, data_collection_name ) ) last_page = 1 total_record_count = len(data_records) # for later usage total_record_count = len(data_records) # get Ack collection ack_collection_name = f"kv_trackme_common_alerts_ack_tenant_{tenant_id}" ack_collection = service.kvstore[ack_collection_name] ( ack_records, ack_collection_keys, ack_collection_dict, last_page, ) = search_kv_collection_sdkmode( logger, service, ack_collection_name, page=1, page_count=0, orderby="object" ) # get priority collection priority_collection_name = f"kv_trackme_{component}_priority_tenant_{tenant_id}" priority_collection = service.kvstore[priority_collection_name] ( priority_records, priority_collection_keys, priority_collection_dict, last_page, ) = search_kv_collection_sdkmode( logger, service, priority_collection_name, page=1, page_count=0, orderby="keyid" ) # get tags collection tags_collection_name = f"kv_trackme_{component}_tags_tenant_{tenant_id}" tags_collection = service.kvstore[tags_collection_name] ( tags_records, tags_collection_keys, tags_collection_dict, last_page, ) = search_kv_collection_sdkmode( logger, service, tags_collection_name, page=1, page_count=0, orderby="keyid" ) # get sla collection sla_collection_name = f"kv_trackme_{component}_sla_tenant_{tenant_id}" sla_collection = service.kvstore[sla_collection_name] ( sla_records, sla_collection_keys, sla_collection_dict, last_page, ) = search_kv_collection_sdkmode( logger, service, sla_collection_name, page=1, page_count=0, orderby="keyid" ) # get disruption queue collection disruption_queue_collection_name = ( f"kv_trackme_common_disruption_queue_tenant_{tenant_id}" ) disruption_queue_collection = service.kvstore[disruption_queue_collection_name] ( disruption_queue_records, disruption_queue_collection_keys, disruption_queue_collection_dict, last_page, ) = search_kv_collection_sdkmode( logger, service, disruption_queue_collection_name, page=1, page_count=0, orderby="keyid" ) logger.debug( f'instance_id={self.instance_id}, disruption_queue_collection_dict="{json.dumps(disruption_queue_collection_dict, indent=2)}"' ) # get outliers data (all components except mhm) if component not in ["mhm"]: # data collection outliers_data_collection_name = ( f"kv_trackme_{component}_outliers_entity_data_tenant_{tenant_id}" ) outliers_data_collection = service.kvstore[outliers_data_collection_name] ( outliers_data_records, outliers_data_collection_keys, outliers_data_collection_dict, last_page, ) = search_kv_collection_sdkmode( logger, service, outliers_data_collection_name, page=1, page_count=0, orderby="keyid" ) # rules collection outliers_rules_collection_name = ( f"kv_trackme_{component}_outliers_entity_rules_tenant_{tenant_id}" ) outliers_rules_collection = service.kvstore[outliers_rules_collection_name] ( outliers_rules_records, outliers_rules_collection_keys, outliers_rules_collection_dict, last_page, ) = search_kv_collection_sdkmode( logger, service, outliers_rules_collection_name, page=1, page_count=0, orderby="keyid" ) # # component specific collections # if component in ["dsm", "dhm", "mhm", "flx", "fqm", "wlk"]: # datagen datagen_collection_name = ( f"kv_trackme_{component}_allowlist_tenant_{tenant_id}" ) datagen_collection = service.kvstore[datagen_collection_name] ( datagen_records, datagen_collection_keys, datagen_collection_dict, datagen_collection_blocklist_not_regex_dict, datagen_collection_blocklist_regex_dict, ) = get_feeds_datagen_kv_collection( datagen_collection, datagen_collection_name, component ) logger.debug( f'instance_id={self.instance_id}, datagen_collection_dict="{json.dumps(datagen_collection_dict, indent=2)}"' ) logger.debug( f'instance_id={self.instance_id}, datagen_collection_blocklist_not_regex_dict="{json.dumps(datagen_collection_blocklist_not_regex_dict, indent=2)}"' ) logger.debug( f'instance_id={self.instance_id}, datagen_collection_blocklist_regex_dict="{json.dumps(datagen_collection_blocklist_regex_dict, indent=2)}"' ) # # splk-dsm specific collections # if component == "dsm": # Data sampling sampling_collection_name = ( f"kv_trackme_dsm_data_sampling_tenant_{tenant_id}" ) sampling_collection = service.kvstore[sampling_collection_name] sampling_records, sampling_collection_keys, sampling_collection_dict = ( get_sampling_kv_collection( sampling_collection, sampling_collection_name ) ) # Docs reference docs_collection_name = f"kv_trackme_dsm_knowledge_tenant_{tenant_id}" docs_collection = service.kvstore[docs_collection_name] ( docs_collection_records, docs_collection_records_dict, docs_collection_members_list, docs_collection_members_dict, ) = get_coll_docs_ref(docs_collection, docs_collection_name) logger.debug( f'instance_id={self.instance_id}, docs_collection_dict="{json.dumps(docs_collection_members_dict, indent=2)}"' ) # # splk-wlk specific collections # if component == "wlk": # apps_disabled apps_enablement_collection_name = ( f"kv_trackme_wlk_apps_enablement_tenant_{tenant_id}" ) apps_enablement_collection = service.kvstore[ apps_enablement_collection_name ] ( apps_enablement_records, apps_enablement_collection_keys, apps_enablement_collection_dict, ) = get_wlk_apps_enablement_kv_collection( apps_enablement_collection, apps_enablement_collection_name ) logger.debug( f'instance_id={self.instance_id}, apps_enablement_collection_dict="{json.dumps(apps_enablement_collection_dict, indent=2)}"' ) # versioning versioning_collection_name = f"kv_trackme_wlk_versioning_tenant_{tenant_id}" versioning_collection = service.kvstore[versioning_collection_name] ( versioning_records, versioning_collection_keys, versioning_collection_dict, last_page, ) = search_kv_collection_sdkmode( logger, service, versioning_collection_name, page=1, page_count=0, orderby="keyid" ) logger.debug( f'instance_id={self.instance_id}, versioning_collection_dict="{json.dumps(versioning_collection_dict, indent=2)}"' ) # orphan orphan_collection_name = f"kv_trackme_wlk_orphan_status_tenant_{tenant_id}" orphan_collection = service.kvstore[orphan_collection_name] ( orphan_records, orphan_collection_keys, orphan_collection_dict, last_page, ) = search_kv_collection_sdkmode( logger, service, orphan_collection_name, page=1, page_count=0, orderby="keyid" ) logger.debug( f'instance_id={self.instance_id}, orphan_collection_dict="{json.dumps(orphan_collection_dict, indent=2)}"' ) # A list to store processed records processed_records = [] # Process records through TrackMe's decision maker workflow records_count = 0 # filter records - server side filters not working for now query_parameters_json = request_info.raw_args["query_parameters"] logger.info( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", component="{component}", received query_parameters_json="{json.dumps(query_parameters_json, indent=2)}"' ) # pre-filtered records prefiltered_records = pre_filter_records(data_records, query_parameters_json) # loop for record in prefiltered_records: records_count += 1 try: logger.debug(f"instance_id={self.instance_id}, processing record") # append_record boolean, True by default unless specific use cases append_record = True # get object_value and key object_value = record.get("object", None) logger.debug( f'instance_id={self.instance_id}, object="{object_value}", record="{json.dumps(record, indent=2)}"' ) # save the current value of object_state in the record as kvcurrent_object_state, we manipulate real state calculations # and we need the original state in some conditions (sla) record["kvcurrent_object_state"] = record.get("object_state", "N/A") # get the KVsotre unique key and add to the record as keyid key_value = record.get("_key", None) record["keyid"] = key_value # get the score for the object and add to the record try: score = int(scores_dict.get(key_value, {}).get("score", 0)) except: score = 0 try: score_outliers = int(scores_dict.get(key_value, {}).get("score_outliers", 0)) except: score_outliers = 0 record["score_outliers"] = score_outliers try: score_source = scores_dict.get(key_value, {}).get("score_source", []) except: score_source = [] record["score"] = score record["score_source"] = score_source # ensure alias has not encoded backslashes record["alias"] = replace_encoded_backslashes(record.get("alias", "")) # # logical group lookup # if component not in ["wlk"]: logical_group_lookup( object_value, logical_coll_members_list, logical_coll_members_dict, record, ) # # some safety checks for feeds (dsm/dhm) # if component in ["dsm"]: dsm_check_default_thresholds(record, trackme_conf) elif component in ["dhm"]: dhm_check_default_thresholds(record, trackme_conf) # # Check Ack # # Call ack_check function ack_check( object_value, ack_collection_keys, ack_collection_dict, record, ) # # Dynamic priority # dynamic_priority_lookup( key_value, priority_collection_keys, priority_collection_dict, record, ) # # Dynamic tags # dynamic_tags_lookup( key_value, tags_collection_keys, tags_collection_dict, record, ) # # Dynamic sla_class # dynamic_sla_class_lookup( key_value, sla_collection_keys, sla_collection_dict, record, ) # # Disruption queue # # Aggregate disruption_min_time_sec: take maximum value across all trackers aggregated_disruption_min_time_sec = default_disruption_min_time_sec if "disruption_min_time_sec" in record: try: disruption_min_time_value = record.get("disruption_min_time_sec") if disruption_min_time_value: disruption_times_by_tracker = None # Parse if it's a JSON string if isinstance(disruption_min_time_value, str): try: disruption_times_by_tracker = json.loads(disruption_min_time_value) except (json.JSONDecodeError, TypeError): # If parsing fails, might be old format numeric value try: aggregated_disruption_min_time_sec = max( default_disruption_min_time_sec, int(float(disruption_min_time_value)) ) except (ValueError, TypeError): pass elif isinstance(disruption_min_time_value, dict): disruption_times_by_tracker = disruption_min_time_value else: # Numeric value (old format) try: aggregated_disruption_min_time_sec = max( default_disruption_min_time_sec, int(float(disruption_min_time_value)) ) except (ValueError, TypeError): pass # If tracker-keyed format, take maximum across all trackers if disruption_times_by_tracker and isinstance(disruption_times_by_tracker, dict): max_disruption_time = max( int(float(v)) for v in disruption_times_by_tracker.values() ) aggregated_disruption_min_time_sec = max( default_disruption_min_time_sec, max_disruption_time ) except Exception as e: logger.error( f'instance_id={self.instance_id}, failed to aggregate disruption_min_time_sec for object="{object_value}", ' f'exception="{str(e)}"' ) disruption_queue_record = disruption_queue_lookup( key_value, disruption_queue_collection_keys, disruption_queue_collection_dict, aggregated_disruption_min_time_sec, ) # # Outliers status (all components except mhm) # if component not in ["mhm"]: outliers_data_lookup( key_value, outliers_data_collection_keys, outliers_data_collection_dict, outliers_rules_collection_keys, outliers_rules_collection_dict, record, ) # # Outliers readiness # outliers_readiness(record) # # Human time fields context # record["latest_flip_time (translated)"] = convert_epoch_to_datetime( record.get("latest_flip_time", "0") ) record["tracker_runtime (translated)"] = convert_epoch_to_datetime( record.get("tracker_runtime", "0") ) # # tags field, if not existing in record, set to "N/A" # tags_auto = record.get("tags_auto", []) tags_manual = record.get("tags_manual", []) if tags_auto: # if tags_auto is a string, convert to a list if isinstance(tags_auto, str): tags_auto = tags_auto.split(",") else: tags_auto = [] # add to record record["tags_auto"] = tags_auto if tags_manual: # if tags_manual is a string, convert to a list if isinstance(tags_manual, str): tags_manual = tags_manual.split(",") else: tags_manual = [] # add to record record["tags_manual"] = tags_manual # merge tags_auto and tags_manual into tags tags = sorted( list(set([x.lower() for x in tags_auto + tags_manual if x])) ) # finally, set the tags field if not existing if not tags: record["tags"] = "N/A" else: record["tags"] = tags # # splk-dsm # # get record fields depending on the component if component == "dsm": # first check blocklist if ( datagen_collection_blocklist_not_regex_dict or datagen_collection_blocklist_regex_dict ): append_record = apply_blocklist( record, datagen_collection_blocklist_not_regex_dict, datagen_collection_blocklist_regex_dict, ) if append_record: # refresh data_last_lag_seen in the record try: record["data_last_lag_seen"] = time.time() - float( record.get("data_last_time_seen", 0) ) except: record["data_last_lag_seen"] = 0 # get outliers and data sampling try: isOutlier = int(record.get("isOutlier", 0)) except: isOutlier = 0 try: OutliersDisabled = int(record.get("OutliersDisabled", 0)) except: OutliersDisabled = 0 try: isAnomaly = int(record.get("isAnomaly", 0)) except: isAnomaly = 0 logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", isAnomaly="{isAnomaly}"' ) # get future_tolerance future_tolerance = record.get("future_tolerance", 0) try: future_tolerance = float(future_tolerance) except: future_tolerance = 0 # get actual primary KPI values data_last_ingestion_lag_seen = record.get( "data_last_ingestion_lag_seen", 0 ) if data_last_ingestion_lag_seen == "": data_last_ingestion_lag_seen = 0 try: data_last_ingestion_lag_seen = float( data_last_ingestion_lag_seen ) except: data_last_ingestion_lag_seen = 0 data_last_lag_seen = record.get("data_last_lag_seen", 0) # get per entity thresholds data_max_lag_allowed = float( record.get("data_max_lag_allowed", 0) ) data_max_delay_allowed = float( record.get("data_max_delay_allowed", 0) ) min_dcount_threshold = record.get("min_dcount_threshold", 0) try: min_dcount_threshold = float(min_dcount_threshold) except: min_dcount_threshold = 0 # get dcount host related information min_dcount_host = record.get("min_dcount_host", "any") try: min_dcount_host = float(min_dcount_host) except: pass min_dcount_field = record.get("min_dcount_field", None) # get monitoring time policy and rules (new fields) monitoring_time_policy = record.get("monitoring_time_policy", None) # if unset yet, use the tenant level and add to the record if monitoring_time_policy is None or len(monitoring_time_policy) == 0: monitoring_time_policy = default_monitoring_time_policy record["monitoring_time_policy"] = default_monitoring_time_policy monitoring_time_rules = record.get("monitoring_time_rules", None) # Get logical group information # get logical group information: object_group_key object_group_key = record.get("object_group_key", "") # from logical_coll_dict, get object_logical_group_dict by object_group_key, this is sent to the status function object_logical_group_dict = logical_coll_dict.get( object_group_key, {} ) # get data_last_ingest, data_last_time_seen, data_last_time_seen_idx (epochtime) data_last_ingest = record.get("data_last_ingest", 0) try: data_last_ingest = float(data_last_ingest) except: pass data_last_time_seen = record.get("data_last_time_seen", 0) if data_last_time_seen == "": data_last_time_seen = 0 try: data_last_time_seen = float(data_last_time_seen) except: data_last_time_seen = 0 data_last_time_seen_idx = record.get( "data_last_time_seen_idx", 0 ) try: data_last_time_seen_idx = float(data_last_time_seen_idx) except: pass # call get_monitoring_time_status and define isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg ( isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg, ) = get_monitoring_time_status( monitoring_time_policy, monitoring_time_rules, ) # call get_outliers_status and define isOutlier (with hybrid scoring) # Note: score and score_outliers are already extracted from scores_dict above (lines 921-933) isOutlier = get_outliers_status( isOutlier, OutliersDisabled, tenant_outliers_set_state, score_outliers=score_outliers ) logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", OutliersDisabled="{OutliersDisabled}", tenant_outliers_set_state="{tenant_outliers_set_state}", score_outliers="{score_outliers}"' ) # # DSM Sampling # # call function dsm_sampling_lookup dsm_sampling_lookup( object_value, sampling_collection_keys, sampling_collection_dict, record, ) # call get_data_sampling_status and define isAnomaly isAnomaly = get_data_sampling_status( record.get("data_sample_status_colour"), record.get("data_sample_feature"), tenant_data_sampling_set_state, ) logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isAnomaly="{isAnomaly}", tenant_data_sampling_set_state="{tenant_data_sampling_set_state}"' ) # call get_future_status and define isFuture ( isFuture, isFutureMsg, merged_future_tolerance, ) = get_future_status( future_tolerance, system_future_tolerance, data_last_lag_seen, data_last_ingestion_lag_seen, data_last_time_seen, data_last_ingest, ) logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isFuture="{isFuture}", future_tolerance="{future_tolerance}", system_future_tolerance="{system_future_tolerance}", merged_future_tolerance="{merged_future_tolerance}", data_last_lag_seen="{data_last_lag_seen}", isFutureMsg="{isFutureMsg}"' ) # call get_is_under_dcount_host and define isUnderDcountHost ( isUnderDcountHost, isUnderDcountHostMsg, ) = get_is_under_dcount_host( min_dcount_host, min_dcount_threshold, min_dcount_field ) logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isUnderDcountHost="{isUnderDcountHost}", isUnderDcountHostMsg="{isUnderDcountHostMsg}", min_dcount_host="{min_dcount_host}", min_dcount_threshold="{min_dcount_threshold}"' ) # call get_dsm_latency_status and define isUnderLatencyAlert and isUnderLatencyMessage ( isUnderLatencyAlert, isUnderLatencyMessage, ) = get_dsm_latency_status( data_last_ingestion_lag_seen, data_max_lag_allowed, data_last_ingest, data_last_time_seen, ) logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isUnderLatencyAlert="{isUnderLatencyAlert}", isUnderLatencyMessage="{isUnderLatencyMessage}", data_last_ingestion_lag_seen="{data_last_ingestion_lag_seen}", data_max_lag_allowed="{data_max_lag_allowed}", data_last_ingest="{data_last_ingest}", data_last_time_seen="{data_last_time_seen}"' ) # call get_dsm_delay_status and define isUnderDelayAlert and isUnderDelayMessage ( isUnderDelayAlert, isUnderDelayMessage, ) = get_dsm_delay_status( data_last_lag_seen, data_max_delay_allowed, data_last_ingest, data_last_time_seen, ) logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isUnderDelayAlert="{isUnderDelayAlert}", isUnderDelayMessage="{isUnderDelayMessage}", data_last_lag_seen="{data_last_lag_seen}", data_max_delay_allowed="{data_max_delay_allowed}", data_last_ingest="{data_last_ingest}", data_last_time_seen="{data_last_time_seen}"' ) # Initialize threshold_scores for DSM (DSM doesn't use dynamic thresholds, so this is always empty) threshold_scores = [] # call set_dsm_status and define object_state and anomaly_reason (with hybrid scoring) ( object_state, status_message, status_message_json, anomaly_reason, ) = set_dsm_status( logger, request_info.server_rest_uri, request_info.system_authtoken, tenant_id, record, isOutlier, isAnomaly, isFuture, isFutureMsg, isUnderMonitoring, isUnderMonitoringMsg, isUnderDcountHost, isUnderDcountHostMsg, object_logical_group_dict, isUnderLatencyAlert, isUnderLatencyMessage, isUnderDelayAlert, isUnderDelayMessage, disruption_queue_collection, disruption_queue_record, source_handler="rest_handler", monitoring_anomaly_reason=monitoring_anomaly_reason, score=score, score_outliers=score_outliers, vtenant_account=vtenant_conf, ) logger.debug( f'instance_id={self.instance_id}, set_dsm_status, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"' ) # insert our main fields record["object_state"] = object_state record["status_message"] = " | ".join(status_message) record["status_message_json"] = status_message_json record["anomaly_reason"] = "|".join(anomaly_reason) # generate charts resources for this entity if load_charts_resources: try: charts_resources = generate_charts_resources( tenant_id=tenant_id, component="dsm", object=object_value, keyid=key_value, anomaly_reason=anomaly_reason, vtenant_conf=vtenant_conf, service=service ) record["charts_resources"] = charts_resources except Exception as e: logger.debug(f"Failed to generate charts for DSM entity {key_value}: {str(e)}") record["charts_resources"] = [] # sampling status sampling_anomaly_status(record) # future tolerance try: record["future_tolerance"] = int( round(merged_future_tolerance, 0) ) except: record["future_tolerance"] = -600 # convert data_last_time_seen to last_time from epoch last_time = convert_epoch_to_datetime(data_last_time_seen) record["last_time"] = last_time # convert data_last_ingest to last_ingest from epoch last_ingest = convert_epoch_to_datetime(data_last_ingest) record["last_ingest"] = last_ingest # convert data_last_time_seen_idx to last_time_idx from epoch last_time_idx = convert_epoch_to_datetime(data_last_time_seen) record["last_time_idx"] = last_time_idx # get and convert latest_flip_time from epoch latest_flip_time_human = record.get("latest_flip_time", 0) try: latest_flip_time_human = float(latest_flip_time_human) except: latest_flip_time_human = 0 record["latest_flip_time_human"] = convert_epoch_to_datetime( latest_flip_time_human ) # set lag_summary field record["lag_summary"] = set_feeds_lag_summary(record, component) # get and set thresholds_duration ( data_max_delay_allowed_duration, data_max_lag_allowed_duration, ) = set_feeds_thresholds_duration(record) record["data_max_delay_allowed_duration"] = ( data_max_delay_allowed_duration ) record["data_max_lag_allowed_duration"] = ( data_max_lag_allowed_duration ) # Documentation note docs_ref_lookup( docs_is_global, docs_note_global, docs_link_global, object_value, docs_collection_members_list, docs_collection_members_dict, record, ) # sla_timer get_sla_timer(record, sla_classes, sla_default_class) # # splk-dhm # elif component == "dhm": # first check blocklist if ( datagen_collection_blocklist_not_regex_dict or datagen_collection_blocklist_regex_dict ): append_record = apply_blocklist( record, datagen_collection_blocklist_not_regex_dict, datagen_collection_blocklist_regex_dict, ) if append_record: # refresh data_last_lag_seen in the record try: record["data_last_lag_seen"] = time.time() - float( record.get("data_last_time_seen", 0) ) except: record["data_last_lag_seen"] = 0 # get splk_dhm_st_summary splk_dhm_st_summary = record.get("splk_dhm_st_summary", None) logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", splk_dhm_st_summary="{splk_dhm_st_summary}"' ) # get outliers and data sampling try: isOutlier = int(record.get("isOutlier", 0)) except: isOutlier = 0 try: OutliersDisabled = int(record.get("OutliersDisabled", 0)) except: OutliersDisabled = 0 try: isAnomaly = int(record.get("isAnomaly", 0)) except: isAnomaly = 0 logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", isAnomaly="{isAnomaly}"' ) # get future_tolerance future_tolerance = record.get("future_tolerance", 0) try: future_tolerance = float(future_tolerance) except: future_tolerance = 0 # get actual primary KPI values data_last_ingestion_lag_seen = record.get( "data_last_ingestion_lag_seen", 0 ) if data_last_ingestion_lag_seen == "": data_last_ingestion_lag_seen = 0 try: data_last_ingestion_lag_seen = float( data_last_ingestion_lag_seen ) except: data_last_ingestion_lag_seen = 0 data_last_lag_seen = record.get("data_last_lag_seen", 0) # get per entity thresholds data_max_lag_allowed = float( record.get("data_max_lag_allowed", 0) ) data_max_delay_allowed = float( record.get("data_max_delay_allowed", 0) ) # get monitoring time policy and rules (new fields) monitoring_time_policy = record.get("monitoring_time_policy", None) # if unset yet, use the tenant level and add to the record if monitoring_time_policy is None or len(monitoring_time_policy) == 0: monitoring_time_policy = default_monitoring_time_policy record["monitoring_time_policy"] = default_monitoring_time_policy monitoring_time_rules = record.get("monitoring_time_rules", None) # Get logical group information # get logical group information: object_group_key object_group_key = record.get("object_group_key", "") # from logical_coll_dict, get object_logical_group_dict by object_group_key, this is sent to the status function object_logical_group_dict = logical_coll_dict.get( object_group_key, {} ) # get data_last_ingest, data_last_time_seen, data_last_time_seen_idx (epochtime) data_last_ingest = record.get("data_last_ingest", 0) try: data_last_ingest = float(data_last_ingest) except: pass data_last_time_seen = record.get("data_last_time_seen", 0) if data_last_time_seen == "": data_last_time_seen = 0 try: data_last_time_seen = float(data_last_time_seen) except: data_last_time_seen = 0 data_last_time_seen_idx = record.get( "data_last_time_seen_idx", 0 ) try: data_last_time_seen_idx = float(data_last_time_seen_idx) except: pass # call get_monitoring_time_status and define isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg ( isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg, ) = get_monitoring_time_status( monitoring_time_policy, monitoring_time_rules, ) # call get_outliers_status and define isOutlier (with hybrid scoring) # Note: score and score_outliers are already extracted from scores_dict above (lines 920-923) isOutlier = get_outliers_status( isOutlier, OutliersDisabled, tenant_outliers_set_state, score_outliers=score_outliers ) logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", OutliersDisabled="{OutliersDisabled}", tenant_outliers_set_state="{tenant_outliers_set_state}", score_outliers="{score_outliers}"' ) # call get_future_status and define isFuture ( isFuture, isFutureMsg, merged_future_tolerance, ) = get_future_status( future_tolerance, system_future_tolerance, data_last_lag_seen, data_last_ingestion_lag_seen, data_last_time_seen, data_last_ingest, ) logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isFuture="{isFuture}", future_tolerance="{future_tolerance}", system_future_tolerance="{system_future_tolerance}", merged_future_tolerance="{merged_future_tolerance}", data_last_lag_seen="{data_last_lag_seen}", isFutureMsg="{isFutureMsg}"' ) # call get_dsm_latency_status and define isUnderLatencyAlert and isUnderLatencyMessage ( isUnderLatencyAlert, isUnderLatencyMessage, ) = get_dsm_latency_status( data_last_ingestion_lag_seen, data_max_lag_allowed, data_last_ingest, data_last_time_seen, ) logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isUnderLatencyAlert="{isUnderLatencyAlert}", isUnderLatencyMessage="{isUnderLatencyMessage}", data_last_ingestion_lag_seen="{data_last_ingestion_lag_seen}", data_max_lag_allowed="{data_max_lag_allowed}", data_last_ingest="{data_last_ingest}", data_last_time_seen="{data_last_time_seen}"' ) # call get_dsm_delay_status and define isUnderDelayAlert and isUnderDelayMessage ( isUnderDelayAlert, isUnderDelayMessage, ) = get_dsm_delay_status( data_last_lag_seen, data_max_delay_allowed, data_last_ingest, data_last_time_seen, ) logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isUnderDelayAlert="{isUnderDelayAlert}", isUnderDelayMessage="{isUnderDelayMessage}", data_last_lag_seen="{data_last_lag_seen}", data_max_delay_allowed="{data_max_delay_allowed}", data_last_ingest="{data_last_ingest}", data_last_time_seen="{data_last_time_seen}"' ) # Initialize threshold_scores for DHM (DHM doesn't use dynamic thresholds, so this is always empty) threshold_scores = [] # call set_dhm_status and define object_state and anomaly_reason (with hybrid scoring) # Note: score and score_outliers are already extracted from scores_dict above (lines 921-933) ( object_state, status_message, status_message_json, anomaly_reason, splk_dhm_alerting_policy, ) = set_dhm_status( logger, request_info.server_rest_uri, request_info.system_authtoken, tenant_id, record, isOutlier, isFuture, isFutureMsg, isUnderMonitoring, isUnderMonitoringMsg, object_logical_group_dict, isUnderLatencyAlert, isUnderLatencyMessage, isUnderDelayAlert, isUnderDelayMessage, default_splk_dhm_alerting_policy, disruption_queue_collection, disruption_queue_record, source_handler="rest_handler", monitoring_anomaly_reason=monitoring_anomaly_reason, score=score, score_outliers=score_outliers, vtenant_account=vtenant_conf, ) logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"' ) # insert our main fields record["object_state"] = object_state record["status_message"] = " | ".join(status_message) record["status_message_json"] = status_message_json record["anomaly_reason"] = "|".join(anomaly_reason) # generate charts resources for this entity if load_charts_resources: try: charts_resources = generate_charts_resources( tenant_id=tenant_id, component="dhm", object=object_value, keyid=key_value, anomaly_reason=anomaly_reason, vtenant_conf=vtenant_conf, service=service ) record["charts_resources"] = charts_resources except Exception as e: logger.debug(f"Failed to generate charts for DHM entity {key_value}: {str(e)}") record["charts_resources"] = [] # future tolerance try: record["future_tolerance"] = int( round(merged_future_tolerance, 0) ) except: record["future_tolerance"] = -600 # specific for dhm record["splk_dhm_alerting_policy"] = splk_dhm_alerting_policy # convert data_last_time_seen to last_time from epoch last_time = convert_epoch_to_datetime(data_last_time_seen) record["last_time"] = last_time # convert data_last_ingest to last_ingest from epoch last_ingest = convert_epoch_to_datetime(data_last_ingest) record["last_ingest"] = last_ingest # convert data_last_time_seen_idx to last_time_idx from epoch last_time_idx = convert_epoch_to_datetime(data_last_time_seen) record["last_time_idx"] = last_time_idx # get and convert latest_flip_time from epoch latest_flip_time_human = record.get("latest_flip_time", 0) try: latest_flip_time_human = float(latest_flip_time_human) except: latest_flip_time_human = 0 record["latest_flip_time_human"] = convert_epoch_to_datetime( latest_flip_time_human ) # set lag_summary field record["lag_summary"] = set_feeds_lag_summary(record, component) # get and set thresholds_duration ( data_max_delay_allowed_duration, data_max_lag_allowed_duration, ) = set_feeds_thresholds_duration(record) record["data_max_delay_allowed_duration"] = ( data_max_delay_allowed_duration ) record["data_max_lag_allowed_duration"] = ( data_max_lag_allowed_duration ) # sourcetype summary record["sourcetype_summary"] = record.get( f"splk_dhm_st_summary_{mode_view}", "{}" ) del record["splk_dhm_st_summary_minimal"] del record["splk_dhm_st_summary_compact"] # splk_dhm_st_summary_full is needed for UI expansion purposes # sla_timer get_sla_timer(record, sla_classes, sla_default_class) # # splk-mhm # elif component == "mhm": # first check blocklist if ( datagen_collection_blocklist_not_regex_dict or datagen_collection_blocklist_regex_dict ): append_record = apply_blocklist( record, datagen_collection_blocklist_not_regex_dict, datagen_collection_blocklist_regex_dict, ) if append_record: # refresh data_last_lag_seen in the record try: record["last_lag_seen"] = time.time() - float( record.get("metric_last_time_seen", 0) ) except: record["last_lag_seen"] = 0 # get metric_details metric_details = record.get("metric_details", None) logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", metric_details="{metric_details}"' ) # metric_details summary replacements record["metric_details"] = record.get( f"metric_details_{mode_view}", "{}" ) # remove metric_details_* for optimization purposes del record["metric_details_minimal"] del record["metric_details_compact"] # metric_details_full cannot be removed for UI expansion purposes # Get logical group information # get logical group information: object_group_key object_group_key = record.get("object_group_key", "") # from logical_coll_dict, get object_logical_group_dict by object_group_key, this is sent to the status function object_logical_group_dict = logical_coll_dict.get( object_group_key, {} ) # get metric_last_time_seen (epochtime) metric_last_time_seen = record.get("metric_last_time_seen", 0) try: metric_last_time_seen = float(metric_last_time_seen) except: pass # call get_future_metrics_status and define isFuture isFuture, isFutureMsg = get_future_metrics_status( system_future_tolerance, metric_last_time_seen, ) logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isFuture="{isFuture}", system_future_tolerance="{system_future_tolerance}", metric_last_time_seen="{metric_last_time_seen}", isFutureMsg="{isFutureMsg}"' ) # call set_mhm_status and define object_state and anomaly_reason (with hybrid scoring) # Note: score and score_outliers are already extracted from scores_dict above (lines 921-933) ( object_state, status_message, status_message_json, anomaly_reason, ) = set_mhm_status( logger, request_info.server_rest_uri, request_info.system_authtoken, tenant_id, record, metric_details, isFuture, isFutureMsg, object_logical_group_dict, disruption_queue_collection, disruption_queue_record, source_handler="rest_handler", score=score, score_outliers=score_outliers, vtenant_account=vtenant_conf, ) logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"' ) # insert our main fields record["object_state"] = object_state record["status_message"] = " | ".join(status_message) record["status_message_json"] = status_message_json record["anomaly_reason"] = "|".join(anomaly_reason) # generate charts resources for this entity if load_charts_resources: try: charts_resources = generate_charts_resources( tenant_id=tenant_id, component="mhm", object=object_value, keyid=key_value, anomaly_reason=anomaly_reason, vtenant_conf=vtenant_conf, service=service ) record["charts_resources"] = charts_resources except Exception as e: logger.debug(f"Failed to generate charts for MHM entity {key_value}: {str(e)}") record["charts_resources"] = [] # convert metric_last_time_seen to last_time from epoch last_time = convert_epoch_to_datetime(metric_last_time_seen) record["last_time"] = last_time # get and convert latest_flip_time from epoch latest_flip_time_human = record.get("latest_flip_time", 0) try: latest_flip_time_human = float(latest_flip_time_human) except: latest_flip_time_human = 0 record["latest_flip_time_human"] = convert_epoch_to_datetime( latest_flip_time_human ) # set lag_summary field record["lag_summary"] = set_feeds_lag_summary(record, component) # sla_timer get_sla_timer(record, sla_classes, sla_default_class) # # splk-flx # # get record fields depending on the component elif component == "flx": # first check blocklist if ( datagen_collection_blocklist_not_regex_dict or datagen_collection_blocklist_regex_dict ): append_record = apply_blocklist( record, datagen_collection_blocklist_not_regex_dict, datagen_collection_blocklist_regex_dict, ) if append_record: # get outliers try: isOutlier = int(record.get("isOutlier", 0)) except: isOutlier = 0 try: OutliersDisabled = int(record.get("OutliersDisabled", 0)) except: OutliersDisabled = 0 logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}"' ) # get monitoring time policy and rules (new fields) monitoring_time_policy = record.get("monitoring_time_policy", None) # if unset yet, use the tenant level and add to the record if monitoring_time_policy is None or len(monitoring_time_policy) == 0: monitoring_time_policy = default_monitoring_time_policy record["monitoring_time_policy"] = default_monitoring_time_policy monitoring_time_rules = record.get("monitoring_time_rules", None) # Get logical group information # get logical group information: object_group_key object_group_key = record.get("object_group_key", "") # from logical_coll_dict, get object_logical_group_dict by object_group_key, this is sent to the status function object_logical_group_dict = logical_coll_dict.get( object_group_key, {} ) # call get_monitoring_time_status and define isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg ( isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg, ) = get_monitoring_time_status( monitoring_time_policy, monitoring_time_rules, ) # call get_outliers_status and define isOutlier (with hybrid scoring) # Note: score and score_outliers are already extracted from scores_dict above (lines 920-923) isOutlier = get_outliers_status( isOutlier, OutliersDisabled, tenant_outliers_set_state, score_outliers=score_outliers ) logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", OutliersDisabled="{OutliersDisabled}", tenant_outliers_set_state="{tenant_outliers_set_state}", score_outliers="{score_outliers}"' ) # Aggregate tracker-keyed JSON fields for concurrent trackers support (same logic as decision maker) # Aggregate metrics: merge all trackers' metrics into a single dict # This must be done BEFORE flx_check_dynamic_thresholds which expects aggregated metrics if "metrics" in record: try: metrics_value = record.get("metrics") if metrics_value: metrics_by_tracker = None # Parse if it's a JSON string if isinstance(metrics_value, str): try: metrics_by_tracker = json.loads(metrics_value) except (json.JSONDecodeError, TypeError): # If parsing fails, might be old format, skip aggregation pass elif isinstance(metrics_value, dict): metrics_by_tracker = metrics_value if metrics_by_tracker and isinstance(metrics_by_tracker, dict): # Check if it's tracker-keyed format (values are dicts) or old format (direct metrics dict) aggregated_metrics = {} is_tracker_keyed = False for key, value in metrics_by_tracker.items(): if isinstance(value, dict): # Check if value looks like metrics (has numeric/string values) or tracker data # If all values in the nested dict are simple types, it's likely metrics if all(isinstance(v, (int, float, str, bool)) or v is None for v in value.values()): # This is tracker-keyed format, merge all trackers' metrics aggregated_metrics.update(value) is_tracker_keyed = True else: # Nested structure, might be tracker data is_tracker_keyed = True aggregated_metrics.update(value) else: # Simple value, old format break if is_tracker_keyed: # Remove internal "status" field from aggregated metrics (not a user metric) if "status" in aggregated_metrics: del aggregated_metrics["status"] # Update record with aggregated metrics as dict (for backward compatibility) # Handle empty aggregated_metrics case (e.g., {"tracker1": {}}) record["metrics"] = aggregated_metrics elif not is_tracker_keyed: # Old format (already aggregated flat dict), remove status field if isinstance(metrics_value, str): try: old_metrics = json.loads(metrics_value) if isinstance(old_metrics, dict): if "status" in old_metrics: old_metrics = old_metrics.copy() del old_metrics["status"] record["metrics"] = old_metrics else: record["metrics"] = {} except: record["metrics"] = {} else: # metrics_by_tracker is already the parsed dict if isinstance(metrics_by_tracker, dict): if "status" in metrics_by_tracker: metrics_by_tracker = metrics_by_tracker.copy() del metrics_by_tracker["status"] record["metrics"] = metrics_by_tracker else: record["metrics"] = {} except Exception as e: logger.error( f'instance_id={self.instance_id}, failed to aggregate metrics for object="{object_value}", ' f'exception="{str(e)}"' ) # flx thresholds lookup flx_thresholds_lookup( object_value, key_value, record, thresholds_collection_dict, ) logger.debug( f'instance_id={self.instance_id}, dynamic_thresholds="{json.dumps(record.get("dynamic_thresholds", {}), indent=2)}"' ) # flx check dynamic thresholds threshold_alert, threshold_messages, threshold_scores = ( flx_check_dynamic_thresholds( logger, record.get("dynamic_thresholds", {}), record.get("metrics", {}), ) ) logger.debug( f'instance_id={self.instance_id}, result function flx_check_dynamic_thresholds object_value="{object_value}", key_value="{key_value}", threshold_alert="{threshold_alert}", threshold_messages="{threshold_messages}", dynamic_thresholds="{json.dumps(record.get("dynamic_thresholds", {}), indent=2)}", metrics_record="{json.dumps(record.get("metrics", {}), indent=2)}"' ) # flx drilldown searches lookup try: flx_drilldown_searches_lookup( tenant_id, record.get("tracker_name", ""), record.get("account", "local"), record, drilldown_searches_collection_dict, ) logger.debug( f'instance_id={self.instance_id}, drilldown_search="{record.get("drilldown_search", "")}", drilldown_search_earliest="{record.get("drilldown_search_earliest", "")}", drilldown_search_latest="{record.get("drilldown_search_latest", "")}", drilldown_searches="{json.dumps(record.get("drilldown_searches", []), indent=2)}"' ) except Exception as e: logger.error(f"instance_id={self.instance_id}, Error in flx_drilldown_searches_lookup: {str(e)}") # flx default metrics lookup try: flx_default_metrics_lookup( tenant_id, record.get("tracker_name", ""), record, default_metrics_collection_dict, ) logger.debug( f'instance_id={self.instance_id}, default_metric="{record.get("default_metric", "")}"' ) except Exception as e: logger.error(f"instance_id={self.instance_id}, Error in flx_default_metrics_lookup: {str(e)}") # Determine number of trackers to decide if we need prefix num_trackers = 1 if "tracker_name" in record: try: tracker_name_value = record.get("tracker_name") if tracker_name_value: if isinstance(tracker_name_value, str): try: tracker_names = json.loads(tracker_name_value) if isinstance(tracker_names, list): num_trackers = len(tracker_names) except (json.JSONDecodeError, TypeError): # If parsing fails, might be comma-separated string if "," in tracker_name_value: num_trackers = len([t.strip() for t in tracker_name_value.split(",")]) elif isinstance(tracker_name_value, list): num_trackers = len(tracker_name_value) except Exception: pass # Aggregate status_description: concatenate all trackers' descriptions if "status_description" in record: try: status_desc_value = record.get("status_description") if status_desc_value: status_desc_by_tracker = None if isinstance(status_desc_value, str): try: status_desc_by_tracker = json.loads(status_desc_value) except (json.JSONDecodeError, TypeError): # If parsing fails, might be old format string, keep as-is pass elif isinstance(status_desc_value, dict): status_desc_by_tracker = status_desc_value if status_desc_by_tracker and isinstance(status_desc_by_tracker, dict): # Check if it's tracker-keyed format (all values are strings) or old format status_descriptions = [] is_tracker_keyed = False for tracker_name, desc in status_desc_by_tracker.items(): if isinstance(desc, str): # Tracker-keyed format if desc: # Only add prefix if multiple trackers if num_trackers > 1: status_descriptions.append(f"{tracker_name}: {desc}") else: status_descriptions.append(desc) is_tracker_keyed = True else: # Not tracker-keyed format break if is_tracker_keyed and status_descriptions: # Update record with aggregated status_description record["status_description"] = " | ".join(status_descriptions) except Exception as e: logger.error( f'instance_id={self.instance_id}, failed to aggregate status_description for object="{object_value}", ' f'exception="{str(e)}"' ) # Aggregate status_description_short: concatenate all trackers' descriptions if "status_description_short" in record: try: status_desc_short_value = record.get("status_description_short") if status_desc_short_value: status_desc_short_by_tracker = None if isinstance(status_desc_short_value, str): try: status_desc_short_by_tracker = json.loads(status_desc_short_value) except (json.JSONDecodeError, TypeError): # If parsing fails, might be old format string, keep as-is pass elif isinstance(status_desc_short_value, dict): status_desc_short_by_tracker = status_desc_short_value if status_desc_short_by_tracker and isinstance(status_desc_short_by_tracker, dict): # Check if it's tracker-keyed format status_descriptions_short = [] is_tracker_keyed = False for tracker_name, desc in status_desc_short_by_tracker.items(): if isinstance(desc, str): # Tracker-keyed format if desc: # Only add prefix if multiple trackers if num_trackers > 1: status_descriptions_short.append(f"{tracker_name}: {desc}") else: status_descriptions_short.append(desc) is_tracker_keyed = True else: # Not tracker-keyed format break if is_tracker_keyed and status_descriptions_short: # Update record with aggregated status_description_short record["status_description_short"] = " | ".join(status_descriptions_short) except Exception as e: logger.error( f'instance_id={self.instance_id}, failed to aggregate status_description_short for object="{object_value}", ' f'exception="{str(e)}"' ) # Aggregate tracker_name: convert JSON array to comma-separated string for display if "tracker_name" in record: try: tracker_name_value = record.get("tracker_name") if tracker_name_value: if isinstance(tracker_name_value, str): try: tracker_names = json.loads(tracker_name_value) if isinstance(tracker_names, list): # Convert array to comma-separated string record["tracker_name"] = ", ".join(tracker_names) except (json.JSONDecodeError, TypeError): # If parsing fails, might be old format string, keep as-is pass elif isinstance(tracker_name_value, list): # Already a list, convert to comma-separated string record["tracker_name"] = ", ".join(tracker_name_value) except Exception as e: logger.error( f'instance_id={self.instance_id}, failed to aggregate tracker_name for object="{object_value}", ' f'exception="{str(e)}"' ) # Aggregate object_description: concatenate all trackers' descriptions if "object_description" in record: try: object_desc_value = record.get("object_description") if object_desc_value: object_desc_by_tracker = None if isinstance(object_desc_value, str): try: object_desc_by_tracker = json.loads(object_desc_value) except (json.JSONDecodeError, TypeError): # If parsing fails, might be old format string, keep as-is pass elif isinstance(object_desc_value, dict): object_desc_by_tracker = object_desc_value if object_desc_by_tracker and isinstance(object_desc_by_tracker, dict): # Check if it's tracker-keyed format (all values are strings) or old format object_descriptions = [] is_tracker_keyed = False for tracker_name, desc in object_desc_by_tracker.items(): if isinstance(desc, str): # Tracker-keyed format if desc: # Only add prefix if multiple trackers if num_trackers > 1: object_descriptions.append(f"{tracker_name}: {desc}") else: object_descriptions.append(desc) is_tracker_keyed = True else: # Not tracker-keyed format break if is_tracker_keyed and object_descriptions: # Update record with aggregated object_description record["object_description"] = " | ".join(object_descriptions) except Exception as e: logger.error( f'instance_id={self.instance_id}, failed to aggregate object_description for object="{object_value}", ' f'exception="{str(e)}"' ) # call set_flx_status and define object_state and anomaly_reason (with hybrid scoring) # Note: score and score_outliers are already extracted from scores_dict above (lines 921-933) ( object_state, status_message, status_message_json, anomaly_reason, ) = set_flx_status( logger, request_info.server_rest_uri, request_info.system_authtoken, tenant_id, record, isOutlier, isUnderMonitoring, isUnderMonitoringMsg, object_logical_group_dict, threshold_alert, threshold_messages, disruption_queue_collection, disruption_queue_record, source_handler="rest_handler", monitoring_anomaly_reason=monitoring_anomaly_reason, score=score, score_outliers=score_outliers, threshold_scores=threshold_scores, vtenant_account=vtenant_conf, ) logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"' ) # insert our main fields record["object_state"] = object_state record["status_message"] = " | ".join(status_message) record["status_message_json"] = status_message_json record["anomaly_reason"] = "|".join(anomaly_reason) # generate charts resources for this entity if load_charts_resources: try: charts_resources = generate_charts_resources( tenant_id=tenant_id, component="flx", object=object_value, keyid=key_value, anomaly_reason=anomaly_reason, vtenant_conf=vtenant_conf, service=service ) record["charts_resources"] = charts_resources except Exception as e: logger.debug(f"Failed to generate charts for FLX entity {key_value}: {str(e)}") record["charts_resources"] = [] # get and convert latest_flip_time from epoch latest_flip_time_human = record.get("latest_flip_time", 0) try: latest_flip_time_human = float(latest_flip_time_human) except: latest_flip_time_human = 0 record["latest_flip_time_human"] = convert_epoch_to_datetime( latest_flip_time_human ) # sla_timer get_sla_timer(record, sla_classes, sla_default_class) # # splk-fqm # # get record fields depending on the component elif component == "fqm": # first check blocklist if ( datagen_collection_blocklist_not_regex_dict or datagen_collection_blocklist_regex_dict ): append_record = apply_blocklist( record, datagen_collection_blocklist_not_regex_dict, datagen_collection_blocklist_regex_dict, ) if append_record: # get outliers try: isOutlier = int(record.get("isOutlier", 0)) except: isOutlier = 0 try: OutliersDisabled = int(record.get("OutliersDisabled", 0)) except: OutliersDisabled = 0 logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}"' ) # get monitoring time policy and rules (new fields) monitoring_time_policy = record.get("monitoring_time_policy", None) # if unset yet, use the tenant level and add to the record if monitoring_time_policy is None or len(monitoring_time_policy) == 0: monitoring_time_policy = default_monitoring_time_policy record["monitoring_time_policy"] = default_monitoring_time_policy monitoring_time_rules = record.get("monitoring_time_rules", None) # Get logical group information # get logical group information: object_group_key object_group_key = record.get("object_group_key", "") # from logical_coll_dict, get object_logical_group_dict by object_group_key, this is sent to the status function object_logical_group_dict = logical_coll_dict.get( object_group_key, {} ) # call get_monitoring_time_status and define isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg ( isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg, ) = get_monitoring_time_status( monitoring_time_policy, monitoring_time_rules, ) # call get_outliers_status and define isOutlier (with hybrid scoring) # Note: score and score_outliers are already extracted from scores_dict above (lines 920-923) isOutlier = get_outliers_status( isOutlier, OutliersDisabled, tenant_outliers_set_state, score_outliers=score_outliers ) logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", OutliersDisabled="{OutliersDisabled}", tenant_outliers_set_state="{tenant_outliers_set_state}", score_outliers="{score_outliers}"' ) # fqm thresholds lookup fqm_thresholds_lookup( object_value, key_value, record, thresholds_collection_dict, ) logger.debug( f'instance_id={self.instance_id}, dynamic_thresholds="{json.dumps(record.get("dynamic_thresholds", {}), indent=2)}"' ) # fqm check dynamic thresholds threshold_alert, threshold_messages, threshold_scores = ( fqm_check_dynamic_thresholds( logger, record.get("dynamic_thresholds", {}), record.get("metrics", {}), ) ) logger.debug( f'instance_id={self.instance_id}, result function fqm_check_dynamic_thresholds object_value="{object_value}", key_value="{key_value}", threshold_alert="{threshold_alert}", threshold_messages="{threshold_messages}", dynamic_thresholds="{json.dumps(record.get("dynamic_thresholds", {}), indent=2)}", metrics_record="{json.dumps(record.get("metrics", {}), indent=2)}"' ) # call set_fqm_status and define object_state and anomaly_reason (with hybrid scoring) # Note: score and score_outliers are already extracted from scores_dict above (lines 921-933) ( object_state, status_message, status_message_json, anomaly_reason, ) = set_fqm_status( logger, request_info.server_rest_uri, request_info.system_authtoken, tenant_id, record, isOutlier, isUnderMonitoring, isUnderMonitoringMsg, object_logical_group_dict, threshold_alert, threshold_messages, disruption_queue_collection, disruption_queue_record, source_handler="rest_handler", monitoring_anomaly_reason=monitoring_anomaly_reason, score=score, score_outliers=score_outliers, threshold_scores=threshold_scores, vtenant_account=vtenant_conf, ) logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"' ) # insert our main fields record["object_state"] = object_state record["status_message"] = " | ".join(status_message) record["status_message_json"] = status_message_json record["anomaly_reason"] = "|".join(anomaly_reason) # generate charts resources for this entity if load_charts_resources: try: charts_resources = generate_charts_resources( tenant_id=tenant_id, component="fqm", object=object_value, keyid=key_value, anomaly_reason=anomaly_reason, vtenant_conf=vtenant_conf, service=service ) record["charts_resources"] = charts_resources except Exception as e: logger.debug(f"Failed to generate charts for FQM entity {key_value}: {str(e)}") record["charts_resources"] = [] # custom breakby fields support: # 1 - try to load the content of fields_quality_summary (JSON as string) # 2 - iterate over the JSON and look for fields metadata.* except the default metadata fields (datamodel, nodename, index, sourcetype) # 3 - if one or more additional metadata fields are found, add them to the record as metadata_ (instead of metadata.) if "fields_quality_summary" in record: try: fields_quality_summary = json.loads(record["fields_quality_summary"]) for field in fields_quality_summary: if field.startswith("metadata."): if field not in ["metadata.datamodel", "metadata.nodename", "metadata.index", "metadata.sourcetype"]: newfield_name = field.replace("metadata.", "metadata_") record[f"{newfield_name}"] = fields_quality_summary[field] except: pass # get and convert latest_flip_time from epoch latest_flip_time_human = record.get("latest_flip_time", 0) try: latest_flip_time_human = float(latest_flip_time_human) except: latest_flip_time_human = 0 record["latest_flip_time_human"] = convert_epoch_to_datetime( latest_flip_time_human ) # sla_timer get_sla_timer(record, sla_classes, sla_default_class) # # splk-wlk # # get record fields depending on the component elif component == "wlk": # first check blocklist if ( datagen_collection_blocklist_not_regex_dict or datagen_collection_blocklist_regex_dict ): append_record = apply_blocklist( record, datagen_collection_blocklist_not_regex_dict, datagen_collection_blocklist_regex_dict, ) if append_record: # set overgroup, if not existing, overgroup is the value of group if "overgroup" not in record: record["overgroup"] = record.get("group") # lookup app enablement wlk_disabled_apps_lookup( record.get("app"), apps_enablement_collection_keys, apps_enablement_collection_dict, record, ) # lookup versioning wlk_versioning_lookup( key_value, versioning_collection_keys, versioning_collection_dict, record, ) # lookup orphan wlk_orphan_lookup( key_value, orphan_collection_keys, orphan_collection_dict, record, ) # Only process if needed if record.get("app_is_enabled") == "False": append_record = False else: # if mode_view is full, replace metrics with metrics_extended and remove metrics_extended if mode_view == "full": record["metrics"] = record.get("metrics_extended", "{}") if "metrics_extended" in record: del record["metrics_extended"] # get outliers try: isOutlier = int(record.get("isOutlier", 0)) except: isOutlier = 0 try: OutliersDisabled = int( record.get("OutliersDisabled", 0) ) except: OutliersDisabled = 0 logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}"' ) # get monitoring time policy and rules (new fields) monitoring_time_policy = record.get("monitoring_time_policy", None) # if unset yet, use the tenant level and add to the record if monitoring_time_policy is None or len(monitoring_time_policy) == 0: monitoring_time_policy = default_monitoring_time_policy record["monitoring_time_policy"] = default_monitoring_time_policy monitoring_time_rules = record.get("monitoring_time_rules", None) # call get_monitoring_time_status and define isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg # Falls back to legacy fields if new fields are not set ( isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg, ) = get_monitoring_time_status( monitoring_time_policy, monitoring_time_rules, ) # call get_outliers_status and define isOutlier (with hybrid scoring) # Note: score and score_outliers are already extracted from scores_dict above (lines 921-933) isOutlier = get_outliers_status( isOutlier, OutliersDisabled, tenant_outliers_set_state, score_outliers=score_outliers ) logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", OutliersDisabled="{OutliersDisabled}", tenant_outliers_set_state="{tenant_outliers_set_state}", score_outliers="{score_outliers}"' ) # call set_wlk_status and define object_state and anomaly_reason (with hybrid scoring) ( object_state, status_message, status_message_json, anomaly_reason, ) = set_wlk_status( logger, request_info.server_rest_uri, request_info.system_authtoken, tenant_id, record, isOutlier, isUnderMonitoring, isUnderMonitoringMsg, disruption_queue_collection, disruption_queue_record, source_handler="rest_handler", monitoring_anomaly_reason=monitoring_anomaly_reason, score=score, score_outliers=score_outliers, vtenant_account=vtenant_conf, ) logger.debug( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"' ) # insert our main fields record["object_state"] = object_state record["status_message"] = " | ".join(status_message) record["status_message_json"] = status_message_json record["anomaly_reason"] = "|".join(anomaly_reason) # generate charts resources for this entity if load_charts_resources: try: charts_resources = generate_charts_resources( tenant_id=tenant_id, component="wlk", object=object_value, keyid=key_value, anomaly_reason=anomaly_reason, vtenant_conf=vtenant_conf, service=service ) record["charts_resources"] = charts_resources except Exception as e: logger.debug(f"Failed to generate charts for WLK entity {key_value}: {str(e)}") record["charts_resources"] = [] # get and convert latest_flip_time from epoch latest_flip_time_human = record.get("latest_flip_time", 0) try: latest_flip_time_human = float(latest_flip_time_human) except: latest_flip_time_human = 0 record["latest_flip_time_human"] = ( convert_epoch_to_datetime(latest_flip_time_human) ) # convert last_time_seen from epoch last_seen = convert_epoch_to_datetime( record.get("last_seen", 0) ) record["last_seen_human"] = last_seen # sla_timer get_sla_timer(record, sla_classes, sla_default_class) if append_record: # # if we do not have a value for object_state or object_state is empty, define to red # if not record.get("object_state", None): record["object_state"] = "red" # # state icon code # record["state_icon_code"] = define_state_icon_code(record) # # End, add to the processed_records list # processed_records.append(record) # log debug only logger.debug(f'instance_id={self.instance_id}, record="{json.dumps(record, indent=2)}"') # # End per component processing # except Exception as e: logger.error( f'instance_id={self.instance_id}, tenant_id="{tenant_id}", component="{component}", Error processing record, record="{json.dumps(record, indent=2)}", exception="{str(e)}"' ) continue # Proceed with next record try: logger.info( f'instance_id={self.instance_id}, collection_name="{data_collection_name}", page="{page}", size="{size}", collection_count="{total_record_count}", last_page="{last_page}"' ) filtered_records = filter_records(processed_records, query_parameters_json) """ for dev debug only if len(filtered_records) > 0: for record in filtered_records[:10]: logger.debug(f'record="{json.dumps(record, indent=2)}"') else: logger.debug(f"no results found") """ # log info logger.info( f'instance_id="{self.instance_id}", trackme_rest_handler_component_user has terminated, run_time="{round((time.time() - start), 3)}"' ) if pagination_mode == "remote": return { "payload": { "last_page": last_page, "data": filtered_records, }, "status": 200, } elif pagination_mode == "local": return { "payload": filtered_records, "status": 200, } except Exception as e: response = { "action": "failure", "response": f'an exception was encountered, exception="{str(e)}"', } logger.error(f"instance_id={self.instance_id}, {json.dumps(response)}") return {"payload": response, "status": 500} # Get the component data with pagination and progressive load capabilities def post_load_component_data_full(self, request_info, **kwargs): describe = False try: resp_dict = json.loads(str(request_info.raw_args["payload"])) except Exception as e: resp_dict = None if resp_dict is not None: try: describe = resp_dict["describe"] if describe in ("true", "True"): describe = True except Exception as e: describe = False if not describe: # tenant_id try: tenant_id = resp_dict["tenant_id"] except Exception as e: return { "payload": {"error": "tenant_id is required"}, "status": 500, } # component try: component = resp_dict["component"] if component not in ( "dsm", "dhm", "mhm", "flx", "fqm", "wlk", ): return { "payload": {"error": "component is invalid"}, "status": 500, } except Exception as e: return { "payload": {"error": "component is required"}, "status": 500, } else: describe = True # if describe is requested, show the usage if describe: response = { "describe": "This endpoint retrieves and returns the full component data with pagination and multithreading, it calls the load_component_data endpoint accordingly, it requires a POST call using data and the following options:", "resource_desc": "Retrieve the full component data with pagination and multithreading", "resource_spl_example": "| trackme url=\"/services/trackme/v2/component/load_component_data_full\" mode=\"post\" body=\"{'tenant_id': 'mytenant', 'component': 'flx'}\"", "options": [ { "tenant_id": "tenant identifier", "component": "component identifier, valid options are: flx, dsm, dhm, mhm, wlk, fqm", } ], } return {"payload": response, "status": 200} # set loglevel loglevel = trackme_getloglevel( request_info.system_authtoken, request_info.server_rest_port ) logger.setLevel(loglevel) # performance counter start = time.time() params = { "tenant_id": tenant_id, "component": component, "page": 1, "size": 0, } # Define an header for requests authenticated communications with splunkd header = { "Authorization": f"Splunk {request_info.system_authtoken}", "Content-Type": "application/json", } # Add the vtenant account url = f"{request_info.server_rest_uri}/services/trackme/v2/component/load_component_data" # results_records list results_records = [] # Proceed try: response = requests.get( url, headers=header, params=params, verify=False, timeout=600, ) if response.status_code not in (200, 201, 204): msg = f'get component has failed, response.status_code="{response.status_code}", response.text="{response.text}"' raise Exception(msg) else: response_json = response.json() last_page = response_json.get("last_page", 1) data = response_json.get("data", []) # add the data to the data_records for record in data: results_records.append(record) logger.info( f"retrieved page 1 with {len(data)} records, last_page={last_page}" ) except Exception as e: msg = f'get component has failed, exception="{str(e)}"' logger.error(msg) return {"payload": {"response": msg}, "status": 500} # run_time run_time = round((time.time() - start), 3) # return the response logger.info( f'context="perf", no_records="{len(results_records)}", run_time="{run_time}", tenant_id="{tenant_id}", component="{component}"' ) return { "payload": { "data": results_records, "entities": len(results_records), "run_time": run_time, }, "status": 200, } def get_chart_labels_and_descriptions(): """ Function to get chart labels and descriptions mapping Returns: Dictionary mapping chart types to their labels, descriptions, and chart types """ return { "latency": { "label": "Event Latency", "description": "Event latency over time showing data ingestion delays", "chart_type": "line" }, "delay": { "label": "Event Delay", "description": "Event delay over time showing time between event occurrence and ingestion", "chart_type": "line" }, "volume": { "label": "Event Volume", "description": "Event volume over time showing the number of events", "chart_type": "line" }, "hosts_dcount": { "label": "Hosts Count", "description": "Distinct host count over time", "chart_type": "line" }, "data_sampling_anomaly": { "label": "Data Sampling Anomaly", "description": "Data sampling model match percentage over time", "chart_type": "bar" }, "flx_status": { "label": "FLX Status", "description": "FLX entity status over time", "chart_type": "line" }, "incidents_events": { "label": "Incident Events", "description": "Stateful alert incidents timeline", "chart_type": "bar" }, "flipping_events": { "label": "State Flipping Events", "description": "Entity state changes over time", "chart_type": "bar" }, "state_events": { "label": "State Events", "description": "Entity state distribution over time", "chart_type": "bar" } } def generate_charts_resources(tenant_id, component, object, keyid, anomaly_reason, vtenant_conf, service): """ Function to generate chart resources for an entity based on component type and anomaly reasons Args: tenant_id: The tenant ID component: The component type (dsm, dhm, mhm, flx, fqm, wlk) object: The object name keyid: The object key ID anomaly_reason: List of anomaly reasons vtenant_conf: Virtual tenant configuration service: Splunk service object Returns: List of chart dictionaries with chart_label, chart_description, and chart_search """ charts = [] chart_labels = get_chart_labels_and_descriptions() try: # Parse anomaly_reason if it's a string if isinstance(anomaly_reason, str): anomaly_reason = anomaly_reason.split("|") if anomaly_reason else [] elif not isinstance(anomaly_reason, list): anomaly_reason = [] # Normalize anomaly_reason (remove empty strings) anomaly_reason = [reason for reason in anomaly_reason if reason and reason.strip()] # Create object_category for chart search object_category = f"splk-{component}" # Check tenant-level feature enablement try: outliers_enabled = int(vtenant_conf.get(f'mloutliers_{component}', 1)) == 1 except Exception as e: outliers_enabled = False try: sampling_enabled = int(vtenant_conf.get('sampling', 1)) == 1 except Exception as e: sampling_enabled = False # Component-specific chart logic if component in ("dsm", "dhm"): # Always include basic charts for DSM/DHM for chart_type in ["latency", "delay", "volume"]: chart_search = get_chart_search( chart_type=chart_type, tenant_id=tenant_id, object_category=object_category, object=object, keyid=keyid ) if chart_search: chart_info = chart_labels.get(chart_type, {}) charts.append({ "chart_label": chart_info.get("label", chart_type), "chart_description": chart_info.get("description", f"{chart_type} chart"), "chart_search": chart_search, "chart_type": chart_info.get("chart_type", "line") }) # DSM-specific charts if component == "dsm": # hosts_dcount chart chart_search = get_chart_search( chart_type="hosts_dcount", tenant_id=tenant_id, object_category=object_category, object=object, keyid=keyid ) if chart_search: chart_info = chart_labels.get("hosts_dcount", {}) charts.append({ "chart_label": chart_info.get("label", "Hosts Count"), "chart_description": chart_info.get("description", "Distinct host count over time"), "chart_search": chart_search, "chart_type": chart_info.get("chart_type", "line") }) # data_sampling_anomaly chart (if anomaly present and sampling enabled) if "data_sampling_anomaly" in anomaly_reason and sampling_enabled: chart_search = get_chart_search( chart_type="data_sampling_anomaly", tenant_id=tenant_id, object_category=object_category, object=object, keyid=keyid ) if chart_search: chart_info = chart_labels.get("data_sampling_anomaly", {}) charts.append({ "chart_label": chart_info.get("label", "Data Sampling Anomaly"), "chart_description": chart_info.get("description", "Data sampling model match percentage over time"), "chart_search": chart_search, "chart_type": chart_info.get("chart_type", "bar") }) elif component == "flx": # FLX status chart chart_search = get_chart_search( chart_type="flx_status", tenant_id=tenant_id, object_category=object_category, object=object, keyid=keyid ) if chart_search: chart_info = chart_labels.get("flx_status", {}) charts.append({ "chart_label": chart_info.get("label", "FLX Status"), "chart_description": chart_info.get("description", "FLX entity status over time"), "chart_search": chart_search, "chart_type": chart_info.get("chart_type", "line") }) # Dynamic FLX metrics try: flx_metrics = flx_get_metrics_catalog_for_object_id( None, service, tenant_id, keyid, timerange_charts="24h" ) if flx_metrics: for flx_metric in flx_metrics: chart_search = get_chart_search( chart_type="flx_metric_group", tenant_id=tenant_id, object_category=object_category, object=object, keyid=keyid, metric_list=[flx_metric] ) if chart_search: # Determine chart type based on metrics (bar if any metric contains "count", otherwise line) chart_type = "bar" if "count" in flx_metric.lower() else "line" charts.append({ "chart_label": f"FLX Metric: {flx_metric}", "chart_description": f"FLX metrics over time for {flx_metric}", "chart_search": chart_search, "chart_type": chart_type }) except Exception as e: logger.debug(f"Failed to get FLX metrics for {keyid}: {str(e)}") elif component == "fqm": # Dynamic FQM metrics try: fqm_metrics = fqm_get_metrics_catalog_for_object_id( None, service, tenant_id, keyid, timerange_charts="24h" ) if fqm_metrics: for fqm_metric in fqm_metrics: chart_search = get_chart_search( chart_type="fqm_metric_group", tenant_id=tenant_id, object_category=object_category, object=object, keyid=keyid, metric_list=[fqm_metric] ) if chart_search: # Determine chart type based on metrics (bar if any metric contains "count", otherwise line) chart_type = "bar" if "count" in fqm_metric.lower() else "line" charts.append({ "chart_label": f"FQM Metric: {fqm_metric}", "chart_description": f"FQM metrics over time for {fqm_metric}", "chart_search": chart_search, "chart_type": chart_type }) except Exception as e: logger.debug(f"Failed to get FQM metrics for {keyid}: {str(e)}") elif component == "wlk": # Dynamic WLK metrics try: wlk_metrics = wlk_get_metrics_catalog_for_object_id( None, service, tenant_id, keyid, timerange_charts="24h" ) if wlk_metrics: for wlk_metric in wlk_metrics: chart_search = get_chart_search( chart_type="wlk_metric_group", tenant_id=tenant_id, object_category=object_category, object=object, keyid=keyid, metric_list=[wlk_metric] ) if chart_search: # Determine chart type based on metrics (bar if any metric contains "count", otherwise line) chart_type = "bar" if "count" in wlk_metric.lower() else "line" charts.append({ "chart_label": f"WLK Metric: {wlk_metric}", "chart_description": f"WLK metrics over time for {wlk_metric}", "chart_search": chart_search, "chart_type": chart_type }) except Exception as e: logger.debug(f"Failed to get WLK metrics for {keyid}: {str(e)}") # ML Outliers charts (for applicable components) if component in ("dsm", "dhm", "flx", "fqm", "wlk") and outliers_enabled: logger.debug(f'handling mloutliers_detection for {component} entity {keyid}, outliers_enabled={outliers_enabled}') try: class _HelperAdapter: def __init__(self, base_logger): self._logger = base_logger def log_debug(self, message): self._logger.debug(message) def log_info(self, message): self._logger.info(message) def log_error(self, message): self._logger.error(message) helper_adapter = _HelperAdapter(logger) ml_models = get_mlmodels_from_kvstore( helper_adapter, service, tenant_id, component, object, keyid ) if ml_models: for model_id in ml_models: chart_search = get_chart_search( chart_type="ml_outliers", tenant_id=tenant_id, object_category=object_category, object=object, keyid=keyid, model_id=model_id ) if chart_search: charts.append({ "chart_label": f"ML Outliers: {model_id}", "chart_description": f"Machine learning outliers detection for model {model_id}", "chart_search": chart_search, "chart_type": "line" }) except Exception as e: logger.error(f"Failed to get ML models for {keyid}: {str(e)}") # Common charts for all components for chart_type in ["incidents_events", "flipping_events", "state_events"]: chart_search = get_chart_search( chart_type=chart_type, tenant_id=tenant_id, object_category=object_category, object=object, keyid=keyid ) if chart_search: chart_info = chart_labels.get(chart_type, {}) charts.append({ "chart_label": chart_info.get("label", chart_type), "chart_description": chart_info.get("description", f"{chart_type} chart"), "chart_search": chart_search, "chart_type": chart_info.get("chart_type", "line") }) except Exception as e: logger.error(f"Error generating charts for {component} entity {keyid}: {str(e)}") # Return empty list on error to not break the main response return [] return charts