#!/usr/bin/env python # coding=utf-8 __author__ = "TrackMe Limited" __copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K." __credits__ = "TrackMe Limited, U.K." __license__ = "TrackMe Limited, all rights reserved" __version__ = "0.1.0" __maintainer__ = "TrackMe Limited, U.K." __email__ = "support@trackme-solutions.com" __status__ = "PRODUCTION" # Standard library imports import os import sys import time import json import hashlib # Logging imports import logging from logging.handlers import RotatingFileHandler # Networking imports import urllib3 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) # splunk home splunkhome = os.environ["SPLUNK_HOME"] # set logging filehandler = RotatingFileHandler( "%s/var/log/splunk/trackme_splk_outliers_tracker_helper.log" % splunkhome, mode="a", maxBytes=10000000, backupCount=1, ) formatter = logging.Formatter( "%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s" ) logging.Formatter.converter = time.gmtime filehandler.setFormatter(formatter) log = logging.getLogger() # root logger - Good to get it only once. 
# Replace any pre-existing file handlers on the root logger with our
# dedicated rotating handler, so this command logs to its own file only.
for hdlr in log.handlers[:]:  # remove the existing file handlers
    if isinstance(hdlr, logging.FileHandler):
        log.removeHandler(hdlr)
log.addHandler(filehandler)  # set the new handler

# set the log level to INFO, DEBUG as the default is ERROR
log.setLevel(logging.INFO)

# append current directory so the app-local libraries below can be imported
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# import libs
import import_declare_test

# import Splunk libs
from splunklib.searchcommands import (
    dispatch,
    GeneratingCommand,
    Configuration,
    Option,
    validators,
)

# import trackme libs
from trackme_libs import (
    trackme_reqinfo,
    trackme_vtenant_account,
    trackme_vtenant_component_info,
    trackme_register_tenant_object_summary,
    run_splunk_search,
    trackme_handler_events,
    trackme_idx_for_tenant,
    trackme_gen_state,
)

# import trackme libs utils
from trackme_libs_utils import remove_leading_spaces, decode_unicode

# import trackme libs croniter
from trackme_libs_croniter import cron_to_seconds

# import trackme libs scoring
from trackme_libs_scoring import trackme_scoring_gen_metrics


@Configuration(distributed=False)
class SplkOutliersTracker(GeneratingCommand):
    """Generating command that monitors ML Outliers models for a tenant.

    For each entity of the requested component (dsm/dhm/flx/fqm/wlk), the
    command renders every enabled ML model, decides whether the last data
    point breaches the model's Lower/Upper bounds, records the result in the
    per-tenant outliers data KVstore collection, emits handler/scoring
    events, and finally registers its execution summary for the tenant.
    """

    tenant_id = Option(
        doc="""
        **Syntax:** **tenant_id=****
        **Description:** The tenant identifier.""",
        require=True,
        default=None,
    )

    component = Option(
        doc="""
        **Syntax:** **component=****
        **Description:** The component category.""",
        require=True,
        default=None,
        validate=validators.Match("component", r"^(?:dsm|dhm|flx|fqm|wlk)$"),
    )

    object = Option(
        doc="""
        **Syntax:** **object=****
        **Description:** Optional, The value for object.""",
        require=False,
        default="*",
        validate=validators.Match("object", r"^.*$"),
    )

    object_id = Option(
        doc="""
        **Syntax:** **object_id=****
        **Description:** Optional, The value for object id.""",
        require=False,
        default="*",
        validate=validators.Match("object_id", r"^.*$"),
    )

    max_runtime = Option(
        doc="""
        **Syntax:** **max_runtime=****
        **Description:** Optional, The max value in seconds for the total runtime of the job, defaults to 900 (15 min)
        which is substracted by 120 sec of margin. Once the job reaches this, it gets terminated""",
        require=False,
        default="900",
        # fix: the validator name was "object" (copy-paste error), which made
        # validation error messages report the wrong option name
        validate=validators.Match("max_runtime", r"^\d*$"),
    )

    force_run = Option(
        doc="""
        **Syntax:** **force_run=****
        **Description:** Optional, force run monitor, if set to True we will not honour the minimal time betwen two monitor execution.""",
        require=False,
        default="False",
        validate=validators.Match("force_run", r"^(?:True|False)$"),
    )

    allow_auto_train = Option(
        doc="""
        **Syntax:** **allow_auto_train=****
        **Description:** Allows automated ML training if not trained since more than system wide parameter.""",
        require=False,
        default=True,
    )

    def _get_log_object_ref(self, object_value=None, object_id_value=None):
        """Return an object reference string for logging purposes.

        Includes object_id when available; falls back to object, and to the
        wildcard 'object="*"' when neither is provided.
        """
        object_id_ref = f'object_id="{object_id_value}"' if object_id_value else ""
        object_ref = f'object="{object_value}"' if object_value else ""
        if object_id_ref and object_ref:
            return f'{object_id_ref}, {object_ref}'
        elif object_id_ref:
            return object_id_ref
        elif object_ref:
            return object_ref
        else:
            return 'object="*"'

    def generate(self, **kwargs):
        """Run the outliers tracker and yield one summary record.

        Yields a single dict per execution outcome: feature disabled, schema
        upgrade in progress, no entities to track, or a success summary with
        per-entity / per-model details. Raises on permanent search failure
        after registering the failure against the tenant.
        """
        # performance counter
        start = time.time()

        # average per-entity execution time, used for the runtime budget check
        average_execution_time = 0

        # Get request info and set logging level
        reqinfo = trackme_reqinfo(
            self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri
        )
        log.setLevel(reqinfo["logging_level"])

        # Get Virtual Tenant account
        vtenant_account = trackme_vtenant_account(
            self._metadata.searchinfo.session_key,
            self._metadata.searchinfo.splunkd_uri,
            self.tenant_id,
        )

        # get vtenant component info
        vtenant_component_info = trackme_vtenant_component_info(
            self._metadata.searchinfo.session_key,
            self._metadata.searchinfo.splunkd_uri,
            self.tenant_id,
        )
        logging.debug(
            f'vtenant_component_info="{json.dumps(vtenant_component_info, indent=2)}"'
        )

        # Get the tenant indexes
        tenant_indexes = trackme_idx_for_tenant(
            self._metadata.searchinfo.session_key,
            self._metadata.searchinfo.splunkd_uri,
            self.tenant_id,
        )

        # check schema version migration state
        try:
            schema_version = int(vtenant_component_info["schema_version"])
            schema_version_upgrade_in_progress = bool(
                int(vtenant_component_info["schema_version_upgrade_in_progress"])
            )
            logging.debug(
                f'schema_version_upgrade_in_progress="{schema_version_upgrade_in_progress}"'
            )
        except Exception as e:
            schema_version = 0
            schema_version_upgrade_in_progress = False
            logging.error(
                f'failed to retrieve schema_version_upgrade_in_progress=, exception="{str(e)}"'
            )

        # Do not proceed if the schema version upgrade is in progress
        if schema_version_upgrade_in_progress:
            yield_json = {
                "_time": time.time(),
                "tenant_id": self.tenant_id,
                "component": self.component,
                "response": f'tenant_id="{self.tenant_id}", schema upgrade is currently in progress, we will wait until the process is completed before proceeding, the schema upgrade is handled by the health_tracker of the tenant and is completed once the schema_version field of the Virtual Tenants KVstore (trackme_virtual_tenants) matches TrackMe\'s version, schema_version="{schema_version}", schema_version_upgrade_in_progress="{schema_version_upgrade_in_progress}"',
                "schema_version": schema_version,
                "schema_version_upgrade_in_progress": schema_version_upgrade_in_progress,
            }
            logging.info(json.dumps(yield_json, indent=2))
            yield {
                "_time": yield_json["_time"],
                "_raw": yield_json,
            }
            # fix: stop here explicitly, the original fell through and relied
            # on a later guard condition to skip processing
            return

        # Default to True (ML Outliers enabled)
        outliers_feature_enabled = True

        # Define the valid components
        valid_components = {"dsm", "dhm", "flx", "fqm", "wlk"}

        # Construct the key dynamically
        key = f"mloutliers_{self.component}"

        # Check if the component is valid and handle exceptions
        if self.component in valid_components:
            try:
                if int(vtenant_account.get(key, 1)) == 0:
                    outliers_feature_enabled = False
            except (ValueError, TypeError):
                # malformed account value: keep the feature enabled (default)
                outliers_feature_enabled = True

        if not outliers_feature_enabled:
            # yield and log
            results_dict = {
                "tenant_id": self.tenant_id,
                "action": "success",
                "results": "ML Anomaly Detection feature is disabled for the tenant and component, no action taken",
                "vtenant_account": vtenant_account,
            }
            yield {"_time": time.time(), "_raw": results_dict}

        else:
            # proceed

            # Get app level config
            splk_outliers_time_monitor_mlmodels_default = reqinfo["trackme_conf"][
                "splk_outliers_detection"
            ]["splk_outliers_time_monitor_mlmodels_default"]

            # counter
            count = 0

            # scoring metrics records
            scoring_metrics_records = []

            # Get the session key
            session_key = self._metadata.searchinfo.session_key

            # Outliers rules storage collection
            collection_rules_name = f"kv_trackme_{self.component}_outliers_entity_rules_tenant_{str(self.tenant_id)}"
            collection_rule = self.service.kvstore[collection_rules_name]

            # Outliers data storage collection
            collection_data_name = f"kv_trackme_{self.component}_outliers_entity_data_tenant_{str(self.tenant_id)}"
            collection_data = self.service.kvstore[collection_data_name]

            # Get data
            kwargs_oneshot = {
                "earliest_time": "-5m",
                "latest_time": "now",
                "output_mode": "json",
                "count": 0,
            }

            #
            # RUN
            #

            # report name for logging purposes
            report_name = f"trackme_{self.component}_outliers_mlmonitor_tracker_tenant_{self.tenant_id}"

            # max runtime
            max_runtime = int(self.max_runtime)

            # Retrieve the search cron schedule
            savedsearch = self.service.saved_searches[report_name]
            savedsearch_cron_schedule = savedsearch.content["cron_schedule"]

            # get the cron_exec_sequence_sec
            try:
                cron_exec_sequence_sec = int(cron_to_seconds(savedsearch_cron_schedule))
            except Exception as e:
                logging.error(
                    f'tenant_id="{self.tenant_id}", component="splk-{self.component}", failed to convert the cron schedule to seconds, error="{str(e)}"'
                )
                cron_exec_sequence_sec = max_runtime

            # the max_runtime cannot be bigger than the cron_exec_sequence_sec
            if max_runtime > cron_exec_sequence_sec:
                max_runtime = cron_exec_sequence_sec

            logging.info(
                f'max_runtime="{max_runtime}", savedsearch_name="{report_name}", savedsearch_cron_schedule="{savedsearch_cron_schedule}", cron_exec_sequence_sec="{cron_exec_sequence_sec}"'
            )

            # If object_id is provided, resolve it to object value for the macro
            object_for_search = self.object
            if self.object_id != "*" and self.object == "*":
                # Query KVstore to get object value from object_id
                try:
                    query_string_filter = {
                        "object_category": f"splk-{self.component}",
                        "_key": self.object_id,
                    }
                    query_string = {"$and": [query_string_filter]}
                    records_outliers_rules = collection_rule.data.query(
                        query=json.dumps(query_string)
                    )
                    if records_outliers_rules:
                        record_outliers_rules = records_outliers_rules[0]
                        object_for_search = record_outliers_rules.get("object", "*")
                        logging.debug(
                            f'Resolved object_id="{self.object_id}" to object="{object_for_search}"'
                        )
                    else:
                        logging.warning(
                            f'Could not resolve object_id="{self.object_id}" to object value, using "*"'
                        )
                        object_for_search = "*"
                except Exception as e:
                    logging.error(
                        f'Failed to resolve object_id="{self.object_id}" to object value, exception="{str(e)}", using "*"'
                    )
                    object_for_search = "*"

            # Define the search providing the list of entities which models need to be monitored
            # force_run=True bypasses the minimal delay between two monitor executions
            if self.force_run == "False":
                search_query = remove_leading_spaces(
                    f"""
                    | `get_splk_outliers_entities("{self.tenant_id}", "{self.component}", "{object_for_search}")`
                    | eval duration_since_last=if(isnum(last_exec_monitor), now()-last_exec_monitor, 0)
                    | where duration_since_last=0 OR duration_since_last>={splk_outliers_time_monitor_mlmodels_default}
                    | sort - duration_since_last
                """
                )
            else:
                search_query = remove_leading_spaces(
                    f"""
                    | `get_splk_outliers_entities("{self.tenant_id}", "{self.component}", "{object_for_search}")`
                    | eval duration_since_last=if(isnum(last_exec_monitor), now()-last_exec_monitor, 0)
                    | sort - duration_since_last
                """
                )

            logging.debug(f'search_query="{search_query}"')

            # run search
            try:
                reader = run_splunk_search(
                    self.service,
                    search_query,
                    kwargs_oneshot,
                    24,
                    5,
                )

                # loop through the results, and monitor models per entity

                # store processed entities in a list
                processed_entities = []

                # Initialize sum of execution times and count of iterations
                total_execution_time = 0
                iteration_count = 0

                # Other initializations
                max_runtime = int(self.max_runtime)

                for item in reader:
                    logging.debug(f'search_results="{item}"')
                    current_time = time.time()
                    elapsed_time = current_time - start

                    if isinstance(item, dict):
                        # iteration start
                        iteration_start_time = time.time()

                        # run the resulting search
                        object_value = decode_unicode(item.get("object"))
                        model_ids = item.get("model_id")

                        # set the global_isOutlier
                        global_isOutlier = 0
                        global_models_in_anomaly = []
                        global_isOutlierReason = []

                        # Define the KV query
                        query_string_filter = {
                            "object_category": f"splk-{self.component}",
                            "object": object_value,
                        }
                        query_string = {"$and": [query_string_filter]}

                        # Get the current record
                        # Notes: the record is returned as an array, as we search for a specific record, we expect one record only
                        key = None
                        rules_key = None  # Store rules key for logging purposes
                        try:
                            records_outliers_rules = collection_rule.data.query(
                                query=json.dumps(query_string)
                            )
                            record_outliers_rules = records_outliers_rules[0]
                            key = record_outliers_rules.get("_key")
                            rules_key = key  # Store for logging
                        except Exception as e:
                            key = None
                            rules_key = None
                            record_outliers_rules = None

                        # if no records, log a warning message and break
                        if not key:
                            object_ref = self._get_log_object_ref(
                                object_value=self.object
                            )
                            msg = f'tenant_id="{self.tenant_id}", object_category="splk-{self.component}", {object_ref} outliers rules record cannot be found or are not yet available for this entity.'
                            # fix: logging.warn is a deprecated alias of warning
                            logging.warning(msg)
                            break

                        #
                        # ML confidence
                        #

                        # retrieve the values for confidence and confidence_reason from the rules KVstore
                        ml_confidence = record_outliers_rules.get("confidence", "low")

                        # log debug
                        logging.debug(
                            f'record_outliers_rules="{record_outliers_rules}"'
                        )

                        # Get the JSON outliers rules object
                        entities_outliers = record_outliers_rules.get(
                            "entities_outliers"
                        )

                        # Load as a dict
                        try:
                            entities_outliers = json.loads(
                                record_outliers_rules.get("entities_outliers")
                            )
                        except Exception as e:
                            # fix: the message was built but never logged,
                            # silently swallowing the parsing failure
                            msg = f'Failed to load entities_outliers with exception="{str(e)}"'
                            logging.error(msg)

                        # log debug
                        logging.debug(f'entities_outliers="{entities_outliers}"')

                        # Load the general enablement
                        try:
                            outliers_is_disabled = int(
                                record_outliers_rules.get("is_disabled")
                            )
                            logging.debug(f'is_disabled="{outliers_is_disabled}"')
                        except Exception as e:
                            # fix: the exception was never interpolated, the
                            # literal "{}" placeholder was logged instead
                            msg = 'Failed to extract one or more expected settings from the entity, is this record corrupted? Exception="{}"'.format(
                                str(e)
                            )
                            logging.error(msg)
                            outliers_is_disabled = 1

                        # process the entity if general outliers is enabled
                        if outliers_is_disabled == 0:
                            # Process all ML models per entity
                            processed_entity_models = {}

                            if model_ids:
                                model_ids = model_ids.split(",")

                                # loop through each model per entity
                                for model_id in model_ids:
                                    # model configuration
                                    try:
                                        model_config = entities_outliers[model_id]
                                    except Exception as e:
                                        model_config = None

                                    if model_config:
                                        logging.debug(
                                            f'configuration for model_id="{model_id}" config="{json.dumps(model_config, indent=4)}"'
                                        )

                                        # If the model is enabled
                                        if int(model_config["is_disabled"]) == 0:
                                            # Get conf from the model
                                            alert_lower_breached = int(
                                                model_config["alert_lower_breached"]
                                            )
                                            alert_upper_breached = int(
                                                model_config["alert_upper_breached"]
                                            )

                                            # get the kpi metric name and value
                                            kpi_metric_name = model_config["kpi_metric"]
                                            logging.debug(
                                                f'kpi_metric_name="{kpi_metric_name}"'
                                            )

                                            # retrieve the score from the model configuration
                                            try:
                                                score = int(
                                                    model_config.get("score", 36)
                                                )
                                            except (ValueError, TypeError):
                                                score = 36

                                            # Set the initial state for that model
                                            isOutlier = 0
                                            isOutlierReason = "None"

                                            # Set the search - use object_id if available, otherwise fall back to object
                                            if key:
                                                object_param = f'object_id="{key}"'
                                            else:
                                                object_param = (
                                                    f'object="{object_value}"'
                                                )

                                            model_render_search = remove_leading_spaces(
                                                f"""\
                                                | trackmesplkoutliersrender tenant_id="{self.tenant_id}" component="{self.component}" {object_param} earliest="-24h" latest="now" model_id="{model_id}" allow_auto_train="{self.allow_auto_train}"
                                                | table _time, *, LowerBound, UpperBound
                                                | sort 0 - _time
                                                | head 1
                                            """
                                            )

                                            logging.info(
                                                f'tenant_id="{self.tenant_id}", {self._get_log_object_ref(object_value=object_value, object_id_value=rules_key)}, model_id="{model_id}", Executing resulting search="{model_render_search}"'
                                            )

                                            # set kwargs
                                            kwargs_oneshot = {
                                                "earliest_time": "-24h",
                                                "latest_time": "now",
                                                "search_mode": "normal",
                                                "preview": False,
                                                "count": 0,
                                                "output_mode": "json",
                                            }

                                            # Performance timer
                                            substart = time.time()

                                            # Run the search
                                            search_results = None
                                            try:
                                                # fix: use dedicated names for the
                                                # model-level reader/item so they do
                                                # not shadow the outer entity loop
                                                model_reader = run_splunk_search(
                                                    self.service,
                                                    model_render_search,
                                                    kwargs_oneshot,
                                                    24,
                                                    5,
                                                )

                                                # loop through the reader results
                                                for results_item in model_reader:
                                                    if isinstance(results_item, dict):
                                                        search_results = results_item

                                                        # raw results logged only in debug
                                                        logging.debug(
                                                            f'search_results="{search_results}"'
                                                        )

                                                        # Inspect results
                                                        time_outlier = search_results[
                                                            "_time"
                                                        ]

                                                        # get rejectedLowerboundOutlier / rejectedUpperboundOutlier / rejectedLowerboundOutlierReason / rejectedUpperboundOutlierReason
                                                        try:
                                                            rejectedLowerboundOutlier = int(
                                                                search_results[
                                                                    "rejectedLowerboundOutlier"
                                                                ]
                                                            )
                                                        except Exception as e:
                                                            rejectedLowerboundOutlier = 0

                                                        try:
                                                            rejectedUpperboundOutlier = int(
                                                                search_results[
                                                                    "rejectedUpperboundOutlier"
                                                                ]
                                                            )
                                                        except Exception as e:
                                                            rejectedUpperboundOutlier = 0

                                                        try:
                                                            rejectedLowerboundOutlierReason = search_results[
                                                                "rejectedLowerboundOutlierReason"
                                                            ]
                                                        except Exception as e:
                                                            rejectedLowerboundOutlierReason = (
                                                                "N/A"
                                                            )

                                                        try:
                                                            rejectedUpperboundOutlierReason = search_results[
                                                                "rejectedUpperboundOutlierReason"
                                                            ]
                                                        except Exception as e:
                                                            rejectedUpperboundOutlierReason = (
                                                                "N/A"
                                                            )

                                                        # try to get the LowerBound and UpperBound
                                                        try:
                                                            LowerBound = search_results[
                                                                "LowerBound"
                                                            ]
                                                        except Exception as e:
                                                            LowerBound = None
                                                            logging.warning(
                                                                f'Could not retrieve a LowerBound value from item="{results_item}"'
                                                            )

                                                        try:
                                                            UpperBound = search_results[
                                                                "UpperBound"
                                                            ]
                                                        except Exception as e:
                                                            UpperBound = None
                                                            logging.warning(
                                                                f'Could not retrieve a UpperBound value from item="{results_item}"'
                                                            )

                                                        try:
                                                            kpi_metric_value = search_results[
                                                                kpi_metric_name
                                                            ]
                                                            logging.debug(
                                                                f'kpi_metric_value="{kpi_metric_value}"'
                                                            )
                                                        except Exception as e:
                                                            kpi_metric_value = None
                                                            logging.warning(
                                                                f'Could not retrieve the kpi_metric_value from item="{results_item}"'
                                                            )

                                                        # Define the outliers status
                                                        if (
                                                            kpi_metric_value
                                                            and LowerBound
                                                            and UpperBound
                                                        ):
                                                            if int(
                                                                alert_lower_breached
                                                            ) == 1 and float(
                                                                kpi_metric_value
                                                            ) < float(LowerBound):
                                                                # Enforce policy for rejected lowerBound outliers
                                                                if (
                                                                    rejectedLowerboundOutlier
                                                                    == 1
                                                                ):
                                                                    isOutlier = 0
                                                                    isOutlierReason = f'Outliers ML for kpi="{kpi_metric_name}", model_id="{model_id}", LowerBound="{round(float(LowerBound), 3)}" has been rejected, rejectedLowerboundOutlierReason="{rejectedLowerboundOutlierReason}", kpi_metric_value="{round(float(kpi_metric_value), 3)}" at time="{time_outlier}", Outlier will not be considered.'

                                                                # Accept Outlier
                                                                else:
                                                                    isOutlier = 1
                                                                    pct_decrease = (
                                                                        (
                                                                            float(
                                                                                LowerBound
                                                                            )
                                                                            - float(
                                                                                kpi_metric_value
                                                                            )
                                                                        )
                                                                        / float(
                                                                            LowerBound
                                                                        )
                                                                    ) * 100
                                                                    isOutlierReason = f'Outliers ML for kpi="{kpi_metric_name}", model_id="{model_id}", LowerBound="{round(float(LowerBound), 3)}" breached with kpi_metric_value="{round(float(kpi_metric_value), 3)}" at time="{time_outlier}", pct_decrease="{round(float(pct_decrease), 2)}"'

                                                                    # add to scoring metrics records
                                                                    scoring_metrics_records.append(
                                                                        {
                                                                            "tenant_id": self.tenant_id,
                                                                            "object_id": key,
                                                                            "object": object_value,
                                                                            "object_category": self.component,
                                                                            "score_source": f"lowerbound_outlier|model_id={model_id}",
                                                                            "metrics_event": {
                                                                                "score": score,
                                                                                "pct_decrease": round(
                                                                                    float(
                                                                                        pct_decrease
                                                                                    ),
                                                                                    2,
                                                                                ),
                                                                            },
                                                                        }
                                                                    )

                                                            elif int(
                                                                alert_upper_breached
                                                            ) == 1 and float(
                                                                kpi_metric_value
                                                            ) > float(UpperBound):
                                                                # Enforce policy for rejected upperBound outliers
                                                                if (
                                                                    rejectedUpperboundOutlier
                                                                    == 1
                                                                ):
                                                                    isOutlier = 0
                                                                    isOutlierReason = f'Outliers ML for kpi="{kpi_metric_name}", model_id="{model_id}", UpperBound="{round(float(UpperBound), 3)}" has been rejected, rejectedUpperboundOutlierReason="{rejectedUpperboundOutlierReason}", kpi_metric_value="{round(float(kpi_metric_value), 3)}" at time="{time_outlier}", Outlier will not be considered.'

                                                                # Accept Outlier
                                                                else:
                                                                    isOutlier = 1
                                                                    pct_increase = (
                                                                        (
                                                                            float(
                                                                                kpi_metric_value
                                                                            )
                                                                            - float(
                                                                                UpperBound
                                                                            )
                                                                        )
                                                                        / float(
                                                                            UpperBound
                                                                        )
                                                                    ) * 100
                                                                    isOutlierReason = f'Outliers ML for kpi="{kpi_metric_name}", model_id="{model_id}", UpperBound="{round(float(UpperBound), 3)}" breached with kpi_metric_value="{round(float(kpi_metric_value), 3)}" at time="{time_outlier}", pct_increase="{round(float(pct_increase), 2)}"'

                                                                    # add to scoring metrics records
                                                                    scoring_metrics_records.append(
                                                                        {
                                                                            "tenant_id": self.tenant_id,
                                                                            "object_id": key,
                                                                            "object": object_value,
                                                                            "object_category": self.component,
                                                                            "score_source": f"upperbound_outlier|model_id={model_id}",
                                                                            "metrics_event": {
                                                                                "score": score,
                                                                                "pct_increase": round(
                                                                                    float(
                                                                                        pct_increase
                                                                                    ),
                                                                                    2,
                                                                                ),
                                                                            },
                                                                        }
                                                                    )

                                                            else:
                                                                isOutlier = 0
                                                                isOutlierReason = "No outliers anomalies were detected"

                                                            # impact the global_isOutlier accordingly
                                                            if isOutlier == 1:
                                                                # only if the confidence allows it
                                                                if (
                                                                    ml_confidence
                                                                    != "low"
                                                                ):
                                                                    global_isOutlier = 1
                                                                    global_models_in_anomaly.append(
                                                                        model_id
                                                                    )
                                                                    global_isOutlierReason.append(
                                                                        isOutlierReason
                                                                    )

                                            except Exception as e:
                                                error_msg = f'tenant_id="{self.tenant_id}", object_category="{self.component}", {self._get_log_object_ref(object_value=object_value, object_id_value=rules_key)}, model_id="{model_id}", search has failed with the following exception="{str(e)}", search="{model_render_search}"'
                                                logging.error(error_msg)

                                            # Performance timer
                                            model_search_runtime = round(
                                                float(time.time()) - float(substart), 3
                                            )

                                            # try loading search results and define a message if did not produce any results
                                            try:
                                                summary_search_results = {
                                                    "time": search_results["_time"],
                                                    "_raw": json.loads(
                                                        search_results["_raw"]
                                                    ),
                                                    "model_render_search": model_render_search,
                                                }
                                            except Exception as e:
                                                summary_search_results = "Outliers search did not produce any results"

                                            # Insert the summary record
                                            model_summary = {
                                                "isOutlier": isOutlier,
                                                "isOutlierReason": isOutlierReason,
                                                "alert_lower_breached": alert_lower_breached,
                                                "alert_upper_breached": alert_upper_breached,
                                                "summary_search_results": summary_search_results,
                                                "search_run_time": model_search_runtime,
                                                "time_exec": time.time(),
                                                "time_human": time.strftime(
                                                    "%c", time.localtime(time.time())
                                                ),
                                            }
                                            processed_entity_models[model_id] = (
                                                model_summary
                                            )

                                            # log info
                                            logging.info(
                                                f'tenant_id="{self.tenant_id}", component="{self.component}", {self._get_log_object_ref(object_value=object_value, object_id_value=rules_key)}, model_summary="{json.dumps(model_summary, indent=4)}"'
                                            )

                            # summary record for that entity
                            processed_entity_record = {
                                "entity": object_value,
                                "processed_model_ids": processed_entity_models,
                            }

                            # append
                            processed_entities.append(processed_entity_record)

                            # increment the entity counter
                            count += 1

                            #
                            # Finally, update the outliers data KVstore
                            #

                            # Define the KV query
                            query_string_filter = {
                                "object_category": f"splk-{self.component}",
                                "object": object_value,
                            }
                            query_string = {"$and": [query_string_filter]}

                            # Get the current record
                            # Notes: the record is returned as an array, as we search for a specific record, we expect one record only
                            key = None
                            try:
                                records_outliers_data = collection_data.data.query(
                                    query=json.dumps(query_string)
                                )
                                record_outliers_data = records_outliers_data[0]
                                key = record_outliers_data.get("_key")
                            except Exception as e:
                                key = None

                            if not key:
                                # new record
                                try:
                                    collection_data.data.insert(
                                        json.dumps(
                                            {
                                                "_key": hashlib.sha256(
                                                    object_value.encode("utf-8")
                                                ).hexdigest(),
                                                "object": str(object_value),
                                                "object_category": f"splk-{self.component}",
                                                "mtime": str(time.time()),
                                                "isOutlier": global_isOutlier,
                                                "isOutlierReason": global_isOutlierReason,
                                                "models_in_anomaly": global_models_in_anomaly,
                                                "models_summary": json.dumps(
                                                    processed_entity_models,
                                                    indent=4,
                                                ),
                                            }
                                        )
                                    )
                                except Exception as e:
                                    logging.error(
                                        f'tenant_id="{self.tenant_id}", component="{self.component}", {self._get_log_object_ref(object_value=object_value, object_id_value=rules_key)}, failed to insert a new KVstore record with exception="{str(e)}"'
                                    )
                            else:
                                try:
                                    # update existing record
                                    collection_data.data.update(
                                        str(key),
                                        json.dumps(
                                            {
                                                "object": str(object_value),
                                                "object_category": f"splk-{self.component}",
                                                "mtime": str(time.time()),
                                                "isOutlier": global_isOutlier,
                                                "isOutlierReason": global_isOutlierReason,
                                                "models_in_anomaly": global_models_in_anomaly,
                                                "models_summary": json.dumps(
                                                    processed_entity_models,
                                                    indent=4,
                                                ),
                                            }
                                        ),
                                    )
                                except Exception as e:
                                    logging.error(
                                        f'tenant_id="{self.tenant_id}", component="{self.component}", {self._get_log_object_ref(object_value=object_value, object_id_value=rules_key)}, failed to update a KVstore record with exception="{str(e)}"'
                                    )

                        # Calculate the execution time for this iteration
                        iteration_end_time = time.time()
                        execution_time = iteration_end_time - iteration_start_time

                        # Update total execution time and iteration count
                        total_execution_time += execution_time
                        iteration_count += 1

                        # Calculate average execution time
                        if iteration_count > 0:
                            average_execution_time = (
                                total_execution_time / iteration_count
                            )
                        else:
                            average_execution_time = 0

                        # Check if there is enough time left to continue
                        # (120 sec safety margin on top of the average per-entity cost)
                        current_time = time.time()
                        elapsed_time = current_time - start
                        if elapsed_time + average_execution_time + 120 >= max_runtime:
                            logging.info(
                                f'tenant_id="{self.tenant_id}", component="splk-{self.component}", max_runtime="{max_runtime}" is about to be reached, current_runtime="{elapsed_time}", job will be terminated now'
                            )
                            break

                # end
                if int(count) > 0:
                    logging.info(
                        f'tenant_id="{self.tenant_id}" outliers tracker job successfully executed, status="success", run_time="{round(time.time() - start, 3)}", report="{str(report_name)}", entities_count="{str(count)}"'
                    )

                    # yield
                    results_dict = {
                        "tenant_id": self.tenant_id,
                        "action": "success",
                        "results": "outliers tracker job successfully executed",
                        "run_time": round((time.time() - start), 3),
                        "entities_count": str(count),
                        "processed_entities": processed_entities,
                        "upstream_search_query": search_query,
                    }
                    yield {"_time": time.time(), "_raw": results_dict}

                    # handler event
                    handler_events_records = []
                    for object_record in processed_entities:
                        handler_events_records.append(
                            {
                                "object": object_record.get("entity"),
                                "object_id": hashlib.sha256(
                                    object_record.get("entity").encode("utf-8")
                                ).hexdigest(),
                                "object_category": f"splk-{self.component}",
                                "handler": f"trackme_{self.component}_outliers_mlmonitor_tracker_tenant_{self.tenant_id}",
                                "handler_message": "Entity was rendered for ML Outliers.",
                                "handler_troubleshoot_search": f"index=_internal sourcetype=trackme:custom_commands:trackmesplkoutliersrender tenant_id={self.tenant_id} object=\"{object_record.get('entity')}\"",
                                "handler_time": time.time(),
                            }
                        )

                    # notification event
                    try:
                        trackme_handler_events(
                            session_key=self._metadata.searchinfo.session_key,
                            splunkd_uri=self._metadata.searchinfo.splunkd_uri,
                            tenant_id=self.tenant_id,
                            sourcetype="trackme:handler",
                            source=f"trackme:handler:{self.tenant_id}",
                            handler_events=handler_events_records,
                        )
                    except Exception as e:
                        # fix: the original message embedded a stray f" prefix,
                        # emitting component=f"splk-..." literally in the log
                        logging.error(
                            f'tenant_id="{self.tenant_id}", component="splk-{self.component}", could not send notification event, exception="{e}"'
                        )

                    # call the scoring gen metrics function
                    scoring_metrics_gen_start = time.time()
                    try:
                        scoring_metrics = trackme_scoring_gen_metrics(
                            self.tenant_id,
                            tenant_indexes.get("trackme_metric_idx"),
                            scoring_metrics_records,
                        )
                        logging.info(
                            f'context="scoring_gen_metrics", tenant_id="{self.tenant_id}", function trackme_scoring_gen_metrics success {scoring_metrics}, run_time={round(time.time()-scoring_metrics_gen_start, 3)}, no_entities={len(scoring_metrics_records)}'
                        )
                    except Exception as e:
                        logging.error(
                            f'context="scoring_gen_metrics", tenant_id="{self.tenant_id}", function trackme_scoring_gen_metrics failed with exception {str(e)}'
                        )

                    # also generate events for the score
                    for score_event in scoring_metrics_records:
                        try:
                            trackme_gen_state(
                                index=tenant_indexes.get("trackme_summary_idx"),
                                sourcetype="trackme:score",
                                source=f"trackme_{self.component}_outliers_mlmonitor_tracker_tenant_{self.tenant_id}",
                                event=score_event,
                            )
                        except Exception as e:
                            logging.error(
                                f'tenant_id="{self.tenant_id}", component="{self.component}", failed to generate score state event, exception="{str(e)}"'
                            )

                else:
                    logging.info(
                        f'tenant_id="{self.tenant_id}" outliers tracker job successfully executed but there were no entities to be tracked at this time, status="success", run_time="{round(time.time() - start, 3)}", report="{str(report_name)}", entities_count="{str(count)}"'
                    )

                    # yield
                    results_dict = {
                        "tenant_id": self.tenant_id,
                        "action": "success",
                        "results": "outliers tracker job successfully executed but there were no entities to be tracked at this time",
                        "run_time": round((time.time() - start), 3),
                        "entities_count": str(count),
                        "upstream_search_query": search_query,
                    }
                    yield {"_time": time.time(), "_raw": results_dict}

                # Call the component register
                trackme_register_tenant_object_summary(
                    session_key,
                    self._metadata.searchinfo.splunkd_uri,
                    self.tenant_id,
                    f"splk-{self.component}",
                    f"trackme_{self.component}_outliers_mlmonitor_tracker_tenant_{str(self.tenant_id)}",
                    "success",
                    time.time(),
                    round(time.time() - start, 3),
                    "The report was executed successfully",
                    "-24h",
                    "now",
                )

            except Exception as e:
                # register the failure against the tenant before re-raising
                trackme_register_tenant_object_summary(
                    session_key,
                    self._metadata.searchinfo.splunkd_uri,
                    self.tenant_id,
                    f"splk-{self.component}",
                    f"trackme_{self.component}_outliers_mlmonitor_tracker_tenant_{str(self.tenant_id)}",
                    "failure",
                    time.time(),
                    round(time.time() - start, 3),
                    str(e),
                    "-24h",
                    "now",
                )
                msg = f'tenant_id="{self.tenant_id}", permanent search failure, exception="{str(e)}", search_query="{search_query}", search_kwargs="{kwargs_oneshot}"'
                logging.error(msg)
                raise Exception(msg)


dispatch(SplkOutliersTracker, sys.argv, sys.stdin, sys.stdout, __name__)