#!/usr/bin/env python # coding=utf-8 __author__ = "TrackMe Limited" __copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K." __credits__ = "TrackMe Limited, U.K." __license__ = "TrackMe Limited, all rights reserved" __version__ = "0.1.0" __maintainer__ = "TrackMe Limited, U.K." __email__ = "support@trackme-solutions.com" __status__ = "PRODUCTION" # Standard library imports import os import sys import time import json import hashlib # Logging imports import logging from logging.handlers import RotatingFileHandler # Networking imports import urllib3 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) # splunk home splunkhome = os.environ["SPLUNK_HOME"] # set logging filehandler = RotatingFileHandler( "%s/var/log/splunk/trackme_splk_outliers_train_helper.log" % splunkhome, mode="a", maxBytes=10000000, backupCount=1, ) formatter = logging.Formatter( "%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s" ) logging.Formatter.converter = time.gmtime filehandler.setFormatter(formatter) log = logging.getLogger() # root logger - Good to get it only once. for hdlr in log.handlers[:]: # remove the existing file handlers if isinstance(hdlr, logging.FileHandler): log.removeHandler(hdlr) log.addHandler(filehandler) # set the new handler # set the log level to INFO, DEBUG as the default is ERROR log.setLevel(logging.INFO) # append current directory sys.path.append(os.path.dirname(os.path.abspath(__file__))) # import libs import import_declare_test # import Splunk libs from splunklib.searchcommands import ( dispatch, GeneratingCommand, Configuration, Option, validators, ) # import trackme libs from trackme_libs import ( trackme_reqinfo, trackme_vtenant_account, trackme_vtenant_component_info, trackme_register_tenant_object_summary, run_splunk_search, trackme_handler_events, ) # import trackme libs croniter from trackme_libs_croniter import cron_to_seconds # import trackme libs utils from trackme_libs_utils import remove_leading_spaces @Configuration(distributed=False) class SplkOutliersExecutor(GeneratingCommand): tenant_id = Option( doc=""" **Syntax:** **tenant_id=**** **Description:** The tenant identifier.""", require=True, default=None, ) component = Option( doc=""" **Syntax:** **component=**** **Description:** The component category.""", require=True, default=None, validate=validators.Match("component", r"^(?:dsm|dhm|flx|fqm|wlk)$"), ) earliest = Option( doc=""" **Syntax:** **earliest=**** **Description:** The earliest time quantifier.""", require=False, default="-24h", ) latest = Option( doc=""" **Syntax:** **latest=**** **Description:** The latest time quantifier.""", require=False, default="now", ) max_runtime_sec = Option( doc=""" **Syntax:** **max_runtime_sec=**** **Description:** The max runtime for the job in seconds, defaults to 60 minutes less 120 seconds of margin.""", require=False, default="3600", validate=validators.Match("max_runtime_sec", r"^\d*$"), ) def generate(self, **kwargs): # performance counter start = time.time() # Track execution times average_execution_time = 0 # Get request info and set logging level reqinfo = trackme_reqinfo( self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri ) log.setLevel(reqinfo["logging_level"]) # Get Virtual Tenant account vtenant_account = trackme_vtenant_account( self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri, self.tenant_id, ) # get vtenant component info vtenant_component_info = trackme_vtenant_component_info( self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri, self.tenant_id, ) logging.debug( f'vtenant_component_info="{json.dumps(vtenant_component_info, indent=2)}"' ) # check schema version migration state try: schema_version = int(vtenant_component_info["schema_version"]) schema_version_upgrade_in_progress = bool( int(vtenant_component_info["schema_version_upgrade_in_progress"]) ) logging.debug( f'schema_version_upgrade_in_progress="{schema_version_upgrade_in_progress}"' ) except Exception as e: schema_version = 0 schema_version_upgrade_in_progress = False logging.error( f'failed to retrieve schema_version_upgrade_in_progress=, exception="{str(e)}"' ) # Do not proceed if the schema version upgrade is in progress if schema_version_upgrade_in_progress: yield_json = { "_time": time.time(), "tenant_id": self.tenant_id, "component": self.component, "response": f'tenant_id="{self.tenant_id}", schema upgrade is currently in progress, we will wait until the process is completed before proceeding, the schema upgrade is handled by the health_tracker of the tenant and is completed once the schema_version field of the Virtual Tenants KVstore (trackme_virtual_tenants) matches TrackMe\'s version, schema_version="{schema_version}", schema_version_upgrade_in_progress="{schema_version_upgrade_in_progress}"', "schema_version": schema_version, "schema_version_upgrade_in_progress": schema_version_upgrade_in_progress, } logging.info(json.dumps(yield_json, indent=2)) yield { "_time": yield_json["_time"], "_raw": yield_json, } # get mloutliers, if set 0 then ML Outliers is disabled for the tenant, 1 we can proceed # Default to True (ML Outliers enabled) outliers_feature_enabled = True # Define the valid components valid_components = {"dsm", "dhm", "flx", "fqm", "wlk"} # Construct the key dynamically key = f"mloutliers_{self.component}" # Check if the component is valid and handle exceptions if self.component in valid_components: try: logging.debug( f'checking if the key "{key}" exists in the vtenant_account' ) outliers_enablement = int(vtenant_account.get(key, 1)) logging.debug( f'vtenant_account="{json.dumps(vtenant_account, indent=2)}", component="{self.component}", key="{key}", outliers_enablement="{outliers_enablement}"' ) if outliers_enablement == 0: outliers_feature_enabled = False logging.debug( f'the key "{key}" exists in the vtenant_account and is set to 0, ML Outliers is disabled for the tenant' ) except (ValueError, TypeError): outliers_feature_enabled = True else: logging.error( f'component="{self.component}" is not valid, valid components are {valid_components}' ) if not outliers_feature_enabled or schema_version_upgrade_in_progress: if not outliers_feature_enabled: # yield and log results_dict = { "tenant_id": self.tenant_id, "action": "success", "results": "ML Anomaly Detection feature is disabled for the tenant and component, no action taken", "vtenant_account": vtenant_account, } yield {"_time": time.time(), "_raw": results_dict} else: # procees # max runtime max_runtime = int(self.max_runtime_sec) # Retrieve the search cron schedule savedsearch_name = f"trackme_{self.component}_outliers_mltrain_tracker_tenant_{self.tenant_id}" savedsearch = self.service.saved_searches[savedsearch_name] savedsearch_cron_schedule = savedsearch.content["cron_schedule"] # get the cron_exec_sequence_sec try: cron_exec_sequence_sec = int(cron_to_seconds(savedsearch_cron_schedule)) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", component="splk-{self.component}", failed to convert the cron schedule to seconds, error="{str(e)}"' ) cron_exec_sequence_sec = max_runtime # the max_runtime cannot be bigger than the cron_exec_sequence_sec if max_runtime > cron_exec_sequence_sec: max_runtime = cron_exec_sequence_sec logging.info( f'max_runtime="{max_runtime}", savedsearch_name="{savedsearch_name}", savedsearch_cron_schedule="{savedsearch_cron_schedule}", cron_exec_sequence_sec="{cron_exec_sequence_sec}"' ) # Get app level config splk_outliers_time_train_mlmodels_default = reqinfo["trackme_conf"][ "splk_outliers_detection" ]["splk_outliers_time_train_mlmodels_default"] splk_outliers_max_runtime_train_mlmodels_default = reqinfo["trackme_conf"][ "splk_outliers_detection" ]["splk_outliers_max_runtime_train_mlmodels_default"] # Get the session key session_key = self._metadata.searchinfo.session_key # Get data kwargs_oneshot = { "earliest_time": self.earliest, "latest_time": self.latest, "output_mode": "json", "count": 0, } # # RUN # # Define the search providing the list of entities which models need to be train # note: do not allow crap entities, sometimes users get very fancy, entities with double quotes in the value # are not expected nor desirable, do not allow crap to get in if self.component in ("dsm", "dhm", "flx", "fqm"): search_query = remove_leading_spaces( f"""\ | inputlookup trackme_{self.component}_outliers_entity_rules_tenant_{self.tenant_id} where object_category="splk-{self.component}" | `trackme_exclude_badentities` | lookup local=t trackme_{self.component}_tenant_{self.tenant_id} object OUTPUT monitored_state | where monitored_state="enabled" | eval duration_since_last=if(last_exec!="pending", now()-last_exec, 0) | where duration_since_last=0 OR duration_since_last>={splk_outliers_time_train_mlmodels_default} | sort - duration_since_last """ ) elif self.component in ("wlk"): search_query = remove_leading_spaces( f"""\ | inputlookup trackme_{self.component}_outliers_entity_rules_tenant_{self.tenant_id} where object_category="splk-{self.component}" | `trackme_exclude_badentities` | lookup local=t trackme_{self.component}_tenant_{self.tenant_id} object OUTPUT app, monitored_state | lookup local=t trackme_{self.component}_apps_enablement_tenant_{self.tenant_id} app OUTPUT enabled as app_enabled | where monitored_state="enabled" AND app_enabled="True" | eval duration_since_last=if(last_exec!="pending", now()-last_exec, 0) | where duration_since_last=0 OR duration_since_last>={splk_outliers_time_train_mlmodels_default} | sort - duration_since_last """ ) logging.debug( f'tenant_id="{self.tenant_id}", component="{self.component}", retrieving the list of entities to be trained from the upstream search="{search_query}"' ) # run search try: reader = run_splunk_search( self.service, search_query, kwargs_oneshot, 24, 5, ) # loop through the results, and train models per entity # Store various key information processed_entities = [] failures_entities = [] failures_entities_object_list = [] searches_toprocess = [] for item in reader: logging.debug(f'search_results="{item}"') current_time = time.time() elapsed_time = current_time - start if isinstance(item, dict): # break if reaching the max run time less 30 seconds of margin if (time.time() - int(start)) - 30 >= int( splk_outliers_max_runtime_train_mlmodels_default ): logging.info( f'tenant_id="{self.tenant_id}" max_runtime="{splk_outliers_max_runtime_train_mlmodels_default}" for ML models was reached with current_runtime="{start}", job will be terminated now' ) break # set the search depending on the component search_train = f'| trackmesplkoutlierstrain tenant_id="{self.tenant_id}" component="{self.component}" object="{item.get("object")}"' # append to our list searches_toprocess.append( { "object_category": item.get("object_category"), "object": item.get("object"), "search": search_train, } ) logging.debug( f'entity_dict="{json.dumps({"object_category": item.get("object_category"), "object": item.get("object"), "search": search_train}, indent=2)}"' ) except Exception as e: msg = f'tenant_id="{self.tenant_id}", main search failed with exception="{str(e)}"' logging.error(msg) raise Exception(msg) # # Process # logging.debug( f'searched to be processed="{json.dumps(searches_toprocess, indent=2)}"' ) # end # errors counter search_errors_count = 0 # Initialize sum of execution times and count of iterations total_execution_time = 0 iteration_count = 0 # Other initializations max_runtime = int(self.max_runtime_sec) - 120 # run if len(searches_toprocess) > 0: # # process searches # for search_record in searches_toprocess: search = search_record.get("search") object_category = search_record.get("object_category") object_value = search_record.get("object") results_entity = [] # iteration start iteration_start_time = time.time() logging.debug( f'tenant_id="{self.tenant_id}" Executing resulting search="{search}"' ) # run search substart = time.time() try: reader = run_splunk_search( self.service, search, kwargs_oneshot, 24, 5, ) for item in reader: if isinstance(item, dict): logging.debug(f'search_results="{item}"') results_entity.append(item) # don't be too noisy if reqinfo["logging_level"] == "DEBUG": processed_entities.append( { "object_category": object_category, "object": object_value, "results_entity": results_entity, "search": search, "runtime": str(time.time() - substart), } ) else: processed_entities.append( { "object_category": object_category, "object": object_value, "search": search, "runtime": str(time.time() - substart), } ) # only in debug logging.debug( f'tenant_id="{self.tenant_id}" search successfully executed in {time.time() - substart} seconds' ) except Exception as e: msg = f'tenant_id="{self.tenant_id}", component="{self.component}", search="{search}", main search failed with exception="{str(e)}"' logging.error(msg) search_errors_count += 1 failures_entities.append( { "object_category": object_category, "object": object_value, "results_entity": { "action": "failure", "search": search, "exception": str(e), }, } ) failures_entities_object_list.append( {"object": object_value, "exception": str(e)} ) # Calculate the execution time for this iteration iteration_end_time = time.time() execution_time = iteration_end_time - iteration_start_time # Update total execution time and iteration count total_execution_time += execution_time iteration_count += 1 # Calculate average execution time if iteration_count > 0: average_execution_time = total_execution_time / iteration_count else: average_execution_time = 0 # Check if there is enough time left to continue current_time = time.time() elapsed_time = current_time - start if elapsed_time + average_execution_time + 120 >= max_runtime: logging.info( f'tenant_id="{self.tenant_id}", component="splk-{self.component}", max_runtime="{max_runtime}" is about to be reached, current_runtime="{elapsed_time}", job will be terminated now' ) break # # end process searches loop # # yield and log results_dict = { "tenant_id": self.tenant_id, "action": "success", "results": "outliers models training job successfully executed", "run_time": round((time.time() - start), 3), "entities_count": len(searches_toprocess), "processed_entities": processed_entities, "failures_entities": failures_entities, "search_errors_count": search_errors_count, "upstream_search_query": search_query, } yield {"_time": time.time(), "_raw": results_dict} logging.info(json.dumps(results_dict, indent=2)) # handler event handler_events_records = [] for object_record in processed_entities: handler_events_records.append( { "object": object_record.get("object"), "object_id": hashlib.sha256( object_record.get("object").encode("utf-8") ).hexdigest(), "object_category": f"splk-{self.component}", "handler": f"trackme_{self.component}_outliers_mltrain_tracker_tenant_{self.tenant_id}", "handler_message": "Entity was trained for ML Outliers.", "handler_troubleshoot_search": f"index=_internal (sourcetype=trackme:custom_commands:trackmesplkoutlierstrain) tenant_id={self.tenant_id} object=\"{object_record.get('object')}\"", "handler_time": time.time(), } ) # notification event try: trackme_handler_events( session_key=self._metadata.searchinfo.session_key, splunkd_uri=self._metadata.searchinfo.splunkd_uri, tenant_id=self.tenant_id, sourcetype="trackme:handler", source=f"trackme:handler:{self.tenant_id}", handler_events=handler_events_records, ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", component=f"splk-{self.component}", could not send notification event, exception="{e}"' ) else: # yield and log results_dict = { "tenant_id": self.tenant_id, "action": "success", "results": "outliers models training job successfully executed but there were no entities to be trained at this time", "run_time": round((time.time() - start), 3), "entities_count": len(searches_toprocess), "upstream_search_query": search_query, } yield {"_time": time.time(), "_raw": results_dict} logging.info(json.dumps(results_dict, indent=2)) # Call the component register report_name = f"trackme_{self.component}_outliers_mltrain_tracker_tenant_{self.tenant_id}" if search_errors_count == 0: trackme_register_tenant_object_summary( session_key, self._metadata.searchinfo.splunkd_uri, self.tenant_id, f"splk-{self.component}", report_name, "success", time.time(), round(time.time() - start, 3), "The report was executed successfully", "-24h", "now", ) else: trackme_register_tenant_object_summary( session_key, self._metadata.searchinfo.splunkd_uri, self.tenant_id, f"splk-{self.component}", report_name, "failure", time.time(), round(time.time() - start, 3), failures_entities_object_list, "-24h", "now", ) dispatch(SplkOutliersExecutor, sys.argv, sys.stdin, sys.stdout, __name__)