#!/usr/bin/env python # coding=utf-8 __author__ = "TrackMe Limited" __copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K." __credits__ = "TrackMe Limited, U.K." __license__ = "TrackMe Limited, all rights reserved" __version__ = "0.1.0" __maintainer__ = "TrackMe Limited, U.K." __email__ = "support@trackme-solutions.com" __status__ = "PRODUCTION" # Standard library imports import os import sys import time import json import uuid import datetime # Logging imports import logging from logging.handlers import RotatingFileHandler # Networking imports import requests import urllib3 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) # splunk home splunkhome = os.environ["SPLUNK_HOME"] # set logging filehandler = RotatingFileHandler( "%s/var/log/splunk/trackme_general_health_manager.log" % splunkhome, mode="a", maxBytes=10000000, backupCount=1, ) formatter = logging.Formatter( "%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s" ) logging.Formatter.converter = time.gmtime filehandler.setFormatter(formatter) log = logging.getLogger() # root logger - Good to get it only once. 
# Replace any previously-attached file handler on the root logger with ours,
# so repeated dispatches of this command do not duplicate log output.
for hdlr in log.handlers[:]:  # remove the existing file handlers
    if isinstance(hdlr, logging.FileHandler):
        log.removeHandler(hdlr)
log.addHandler(filehandler)  # set the new handler
# set the log level to INFO, DEBUG as the default is ERROR
log.setLevel(logging.INFO)

# append current directory
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# import libs
import import_declare_test

# import Splunk libs
from splunklib.searchcommands import (
    dispatch,
    GeneratingCommand,
    Configuration,
    Option,
    validators,
)

# import trackme libs
from trackme_libs import (
    trackme_reqinfo,
)

# import trackme libs
from trackme_libs import (
    run_splunk_search,
    trackme_manage_report_schedule,
    trackme_report_update_enablement,
)

# import trackme libs utils
from trackme_libs_utils import remove_leading_spaces

# import TrackMe get data libs
from trackme_libs_get_data import get_full_kv_collection

# import the collections dict
from collections_data import collections_dict
from collections_data import (
    collections_list_dsm,
    collections_list_flx,
    collections_list_fqm,
    collections_list_dhm,
    collections_list_mhm,
    collections_list_wlk,
    collections_list_common,
)

# logging:
# To avoid overriding logging destination of callers, the libs will not set on purpose any logging definition
# and rely on callers themselves


# Generating custom search command running on the search head only
# (distributed=False): performs TrackMe global health-manager maintenance tasks.
@Configuration(distributed=False)
class HealthTracker(GeneratingCommand):

    @staticmethod
    def safe_create_datetime(year, month, day, hour=0, minute=0, second=0, tzinfo=None):
        """
        Safely create a datetime object, handling leap years.
        If trying to create Feb 29 in a non-leap year, falls back to Feb 28.
        Args:
            year: Year
            month: Month (1-12)
            day: Day of month
            hour: Hour (default 0)
            minute: Minute (default 0)
            second: Second (default 0)
            tzinfo: Timezone info (default None)

        Returns:
            datetime.datetime object
        """
        # Check if this is Feb 29 and the year is not a leap year
        if month == 2 and day == 29:
            # Check if year is a leap year (Gregorian rule: divisible by 4,
            # except centuries, unless divisible by 400)
            is_leap_year = (year % 4 == 0 and year % 100 != 0) or (year % 400 == 0)
            if not is_leap_year:
                # Fall back to Feb 28 for non-leap years
                day = 28
                logging.debug(f'Leap year adjustment: Feb 29 in non-leap year {year} adjusted to Feb 28')
        return datetime.datetime(year, month, day, hour, minute, second, tzinfo=tzinfo)

    def get_uuid(self):
        """
        Function to return a unique uuid which is used to trace performance run_time of each subtask.
        """
        return str(uuid.uuid4())

    def get_ml_rules_collection(self, collection):
        """
        Get all records from an ML rules collection.

        :param collection: The collection to query.
        :return: A list of records, a dictionary of records, a list of keys.
        """
        collection_records = []
        collection_records_dict = {}
        count_to_process_list = []
        end = False
        skip_tracker = 0
        # Page through the KV store collection until an empty page is returned.
        while not end:
            process_collection_records = collection.data.query(skip=skip_tracker)
            if process_collection_records:
                for item in process_collection_records:
                    collection_records.append(item)
                    collection_records_dict[item.get("_key")] = (
                        item  # Add the entire item to the dictionary
                    )
                    count_to_process_list.append(item.get("_key"))
                # NOTE(review): offset advances by a fixed 5000 per page — this
                # assumes the KV store returns at most 5000 records per query
                # (the default limit); confirm against limits.conf, otherwise
                # records could be skipped or duplicated.
                skip_tracker += 5000
            else:
                end = True
        return collection_records, collection_records_dict, count_to_process_list

    def remove_ml_model(
        self,
        component,
        rest_url,
        header,
        ml_model_lookup_name,
        instance_id=None,
        task_name=None,
        task_instance_id=None,
    ):
        """
        Removes an orphan Machine Learning model from the collection.

        :param component: The component name.
        :param rest_url: The REST URL to use.
        :param header: The header to use.
        :param ml_model_lookup_name: The Machine Learning model lookup name.
        :param instance_id: The instance ID for logging purposes.
        :param task_name: The task name for logging purposes.
        :param task_instance_id: The task instance ID for logging purposes.
        :return: True if the model was removed successfully, otherwise False.
        """
        # NOTE(review): despite the docstring, this method never returns False —
        # any failure raises Exception instead; callers must catch it.
        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{component}", attempting to delete orphan Machine Learning lookup_name="{ml_model_lookup_name}"'
        )
        try:
            # DELETE against the lookup-table-files REST endpoint for this model.
            response = requests.delete(
                rest_url,
                headers=header,
                verify=False,
                timeout=600,
            )
            # Any non-2xx status is treated as a failure.
            if response.status_code not in (
                200,
                201,
                204,
            ):
                error_msg = f'failure to delete ML lookup_name="{ml_model_lookup_name}", url="{rest_url}", response.status_code="{response.status_code}", response.text="{response.text}"'
                raise Exception(error_msg)
            else:
                logging.info(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, action="success", deleted lookup_name="{ml_model_lookup_name}" successfully'
                )
                return True
        except Exception as e:
            # NOTE(review): re-wrapping in a plain Exception drops the original
            # traceback chain (and double-wraps the HTTP error message above).
            error_msg = f'failure to delete ML lookup_name="{ml_model_lookup_name}" with exception="{str(e)}"'
            raise Exception(error_msg)

    def reassign_ml_model(
        self,
        model_id,
        rest_url,
        header,
        instance_id=None,
        task_name=None,
        task_instance_id=None,
    ):
        """
        Reassign a Machine Learning model to the Splunk system user.

        :param model_id: The model_id to reassign.
        :param rest_url: The REST URL to use.
        :param header: The header to use.
        :param instance_id: The instance ID for logging purposes.
        :param task_name: The task name for logging purposes.
        :param task_instance_id: The task instance ID for logging purposes.
        :return: True if the model was reassigned successfully, otherwise False.
        """
        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, attempting to re-assign model_id="{model_id}" to splunk-system-user'
        )

        # Target ACL: user-level sharing owned by splunk-system-user.
        acl_properties = {
            "sharing": "user",
            "owner": "splunk-system-user",
        }

        # proceed boolean
        proceed = False

        # before re-assigning, check if the model exist by running a GET request, if the status code is different from 2**, do not proceed and log an informational message instead
        try:
            response = requests.get(
                f"{rest_url}",
                headers=header,
                verify=False,
                timeout=600,
            )
            if response.status_code not in (200, 201, 204):
                logging.info(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, model_id="{model_id}" does not exist, it might have been re-assigned in the meantime, skipping re-assignment'
                )
                return False
            else:
                proceed = True
        except Exception as e:
            # NOTE(review): when the existence GET raises, proceed stays False
            # and the method falls through to an implicit None return (falsy,
            # so callers counting successes are unaffected) — confirm intended.
            logging.error(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, model_id="{model_id}" failed to retrieve model, exception="{str(e)}"'
            )

        if proceed:
            try:
                # POST to the object's /acl endpoint to change owner/sharing.
                response = requests.post(
                    f"{rest_url}/acl",
                    headers=header,
                    data=acl_properties,
                    verify=False,
                    timeout=600,
                )
                if response.status_code not in (
                    200,
                    201,
                    204,
                ):
                    error_msg = f'failure to reassign model_id="{model_id}", url="{rest_url}", response.status_code="{response.status_code}", response.text="{response.text}"'
                    raise Exception(error_msg)
                else:
                    logging.info(
                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, action="success", model_id="{model_id}" reassigned successfully'
                    )
                    return True
            except Exception as e:
                # Log then re-raise so the caller's failure counter increments.
                logging.error(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, action="failure", model_id="{model_id}" reassigned failed, exception="{str(e)}"'
                )
                raise Exception(str(e))

    def get_all_accounts(self, instance_id=None, task_name=None, task_instance_id=None):
        """
        Update the configuration of any existing remote account, to ensure that the configuration is up to
        date.

        :param instance_id: The instance ID for logging purposes.
        :param task_name: The task name for logging purposes.
        :param task_instance_id: The task instance ID for logging purposes.
        :return: A list of remote accounts.
        """

        # endpoint target
        url = f"{self._metadata.searchinfo.splunkd_uri}/servicesNS/nobody/trackme/trackme_account"

        # current_remote_accounts_list
        current_remote_accounts_list = []

        # first, get the list of remote accounts
        try:
            response = requests.get(
                url,
                headers={
                    "Authorization": f"Splunk {self._metadata.searchinfo.session_key}",
                    "Content-Type": "application/json",
                },
                verify=False,
                params={
                    "output_mode": "json",
                    "count": -1,
                },
                timeout=600,
            )
            response.raise_for_status()
            response_json = response.json()

            # The list of remote accounts is stored as a list in entry
            remote_accounts = response_json.get("entry", [])

            # iterate through the remote accounts, adding them to the dict, name is the key, then we care about "content" which is a dict of our parameters
            # for this account
            for remote_account in remote_accounts:
                remote_account_name = remote_account.get("name", None)
                # add to list
                current_remote_accounts_list.append(remote_account_name)

            return current_remote_accounts_list

        except Exception as e:
            # Best-effort: log and return an empty list so the caller's loop
            # simply has nothing to maintain.
            logging.error(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, error while fetching remote account list: {str(e)}'
            )
            return []

    #
    # Main
    #

    def generate(self, **kwargs):
        # NOTE(review): `if self:` is always true for an instance — this guard
        # looks like a structural artifact rather than a real condition; confirm.
        if self:
            # performance counter
            global_start = time.time()

            # set instance_id
            instance_id = self.get_uuid()

            # Get request info and set logging level
            reqinfo = trackme_reqinfo(
                self._metadata.searchinfo.session_key,
                self._metadata.searchinfo.splunkd_uri,
            )
            log.setLevel(reqinfo["logging_level"])

            # Splunk header for REST requests
            header = {
                "Authorization": f"Splunk {self._metadata.searchinfo.session_key}",
                "Content-Type": "application/json",
            }

            logging.info(  # First log message
                f'context="general_execution", trackmegeneralhealthmanager is starting now.'
) # global_results_dict to store results of the execution global_results_dict = {} # Register the object summary in the vtenant collection collection_vtenants_name = "kv_trackme_virtual_tenants" collection_vtenants = self.service.kvstore[collection_vtenants_name] # get all vtenants records, this job is not tenant specific vtenant_records = collection_vtenants.data.query() ############################################################ # Machine Learning related global health manager tasks # Goals: # - Inspect all ML collections, identify orphans models, # and reassign if necessary ############################################################ # Reassignment: Ensures that all ML models are owned by splunk-system-user, amd re-assign otherwise # run the following search to retrieve the list of existing ML models task_start = time.time() task_instance_id = self.get_uuid() task_name = "mlmodels-management:splunk-system-user_reassignment" logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' 
) # counters ml_models_reassigned_success_count = 0 ml_models_reassigned_failures_count = 0 logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting verification of ML models ownership and reassignment if necessary' ) # Define the query search = f'| rest splunk_server=local timeout=1200 "/servicesNS/nobody/trackme/data/lookup-table-files" | search eai:acl.app="trackme" AND title="__mlspl_model_*.mlmodel" | table title, id' kwargs_oneshot = { "earliest_time": "-5m", "latest_time": "now", "output_mode": "json", "count": 0, } # A list to store current ml models (filename) ml_models_for_reassignement_current_list = [] # A dict to store the existing models ml_models_for_reassignement_dict_existing = {} try: reader = run_splunk_search( self.service, search, kwargs_oneshot, 24, 5, ) for item in reader: if isinstance(item, dict): ml_models_for_reassignement_current_list.append( item.get("title") ) # this is the model filename ml_models_for_reassignement_dict_existing[item.get("title")] = { "id": item.get("id") } except Exception as e: logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to retrieve the list of ML models, exception="{str(e)}"' ) # Loop for model_id in ml_models_for_reassignement_current_list: # reassign the model rest_url = ml_models_for_reassignement_dict_existing[model_id].get("id") logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, attempting reassignment of model_id={model_id}"' ) try: reassigned_model = self.reassign_ml_model( model_id, rest_url, header, instance_id, task_name, task_instance_id, ) if reassigned_model: ml_models_reassigned_success_count += 1 except Exception as e: ml_models_reassigned_failures_count += 1 logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to reassign the model, model_id="{model_id}", exception="{str(e)}"' ) 
############################################################ # Identify ML models owned by splunk-system-user ############################################################ # run the following search to retrieve the list of existing ML models logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting verification of ML models ownership and reassignment if necessary' ) # Define the query search = f'| rest splunk_server=local timeout=1200 "/servicesNS/splunk-system-user/trackme/data/lookup-table-files" | search eai:acl.app="trackme" AND title="__mlspl_model_*.mlmodel" | table title, id' kwargs_oneshot = { "earliest_time": "-5m", "latest_time": "now", "output_mode": "json", "count": 0, } # A list to store current ml models (filename) ml_models_current_list = [] # A dict to store the existing models ml_models_dict_existing = {} try: reader = run_splunk_search( self.service, search, kwargs_oneshot, 24, 5, ) for item in reader: if isinstance(item, dict): ml_models_current_list.append( item.get("title") ) # this is the model filename ml_models_dict_existing[item.get("title")] = { "id": item.get("id") } except Exception as e: logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to retrieve the list of ML models, exception="{str(e)}"' ) ############################################################ # Identify ML models configured in TrackMe ############################################################ # A list to store ml_rules_outliers_collections ml_rules_outliers_collections = [] # A dict to ml models definitions ml_models_dict = {} # A list to store ml models currently configured ml_models_list = [] for vtenant_record in vtenant_records: logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, processing vtenant_record={json.dumps(vtenant_record, indent=2)}' ) # get the tenant_id tenant_id = vtenant_record.get("tenant_id") # for 
component in dsm, dhm, flx, fqm, wlk for component in ["dsm", "dhm", "flx", "fqm", "wlk"]: # get status component_status = vtenant_record.get(f"tenant_{component}_enabled") # append the collection if component_status == 1: ml_rules_outliers_collections.append( f"kv_trackme_{component}_outliers_entity_rules_tenant_{tenant_id}" ) # for each outliers rules collection for ml_rules_collection_name in ml_rules_outliers_collections: # connect to the collection service and retrieve the records ml_rules_collection = self.service.kvstore[ml_rules_collection_name] # extract ml_rules_tenant_id from the collection name: trackme__outliers_entity_rules_tenant_ ml_rules_tenant_id = ml_rules_collection_name.split("_")[-1] # get records try: ml_rules_records, ml_rules_records_dict, ml_rules_records_count = ( self.get_ml_rules_collection(ml_rules_collection) ) for ml_rules_record in ml_rules_records: # get key ml_rules_record_key = ml_rules_record.get("_key") # get dictionnary entities_outliers from the field entities_outliers entities_outliers = json.loads( ml_rules_record.get("entities_outliers") ) # loop trough entities_outliers, the dict key is the model_id for ml_model_entity in entities_outliers: ml_models_dict[ml_model_entity] = { "model_id": ml_model_entity, "collection_name": ml_rules_collection_name, "collection_key": ml_rules_record_key, "tenant_id": ml_rules_tenant_id, } ml_models_list.append( f"__mlspl_{ml_model_entity}.mlmodel" ) # this is the filename except Exception as e: logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to retrieve the records from the collection, collection_name="{ml_rules_collection_name}", exception="{str(e)}"' ) # log logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, {len(ml_models_dict)} ML models were found configured in TrackMe collections, will now start inspecting Splunk existing models.' 
) # log the number of currently existing models logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, {len(ml_models_current_list)} ML models were found in the system, starting orphan models inspection' ) # # orphan models purge / reassign # ml_models_purged_success_count = 0 ml_models_purged_failures_count = 0 # for each model in ml_models_current_list, if the model is not in ml_models_list, delete it for model_id in ml_models_current_list: if model_id not in ml_models_list and not model_id == "pending": # remove the model rest_url = ml_models_dict_existing[model_id].get("id") logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, attempting removal of model_id={model_id}"' ) try: self.remove_ml_model( "trackme", rest_url, header, model_id, instance_id, task_name, task_instance_id, ) ml_models_purged_success_count += 1 except Exception as e: ml_models_purged_failures_count += 1 logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to remove the orphan model, model_id="{model_id}", exception="{str(e)}"' ) # end context="mlmodels-management" # log logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, {ml_models_purged_success_count} orphan ML models were removed, {ml_models_purged_failures_count} orphan ML models removals failed, {ml_models_reassigned_success_count} ML models were reassigned to splunk-system-user, {ml_models_reassigned_failures_count} ML models reassignments failed' ) # add to results global_results_dict["mlmodels_management"] = { "ml_models_in_system_count": len(ml_models_current_list), "ml_models_configured_count": len(ml_models_list), "ml_models_purged_success_count": ml_models_purged_success_count, "ml_models_purged_failures_count": ml_models_purged_failures_count, "ml_models_reassigned_success_count": ml_models_reassigned_success_count, 
"ml_models_reassigned_failures_count": ml_models_reassigned_failures_count, "result": f"{ml_models_purged_success_count} orphan ML models were removed, {ml_models_purged_failures_count} orphan ML models removals failed, {ml_models_reassigned_success_count} ML models were reassigned to splunk-system-user, {ml_models_reassigned_failures_count} ML models reassignments failed", } logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.' ) ############################################################ # End Machine Learning related global health manager tasks ############################################################ ############################################################ # Splunk Remote Accounts maintenance # Goals: # - Calls the associated REST endpoint for each existing account, # to verify, update account parameters if needed, and peform tokens # rotation if needed ############################################################ task_start = time.time() task_instance_id = self.get_uuid() task_name = "splunk-remote-accounts:verify_and_maintain_accounts" logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' 
) logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, context="splunk-remote-accounts", starting verification and maintenance of Splunk remote accounts' ) # get all accounts current_remote_accounts_list = self.get_all_accounts( instance_id, task_name, task_instance_id ) # remote_accounts_maintenance_dict remote_accounts_maintenance_dict = {} # Loop through accounts, and call the endpoint for account in current_remote_accounts_list: url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/configuration/admin/maintain_remote_account" try: response = requests.post( url, headers={ "Authorization": f"Splunk {self._metadata.searchinfo.session_key}", "Content-Type": "application/json", }, verify=False, data=json.dumps( { "accounts": account, } ), timeout=600, ) response.raise_for_status() response_json = response.json() remote_accounts_maintenance_dict[account] = response_json except Exception as e: error_msg = f'error calling endpoint, exception="{str(e)}"' logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, {error_msg}' ) remote_accounts_maintenance_dict[account] = error_msg # add to global_results_dict, if the dict is empty, add a message to the global_results_dict as we had no actions to perform if not remote_accounts_maintenance_dict: global_results_dict[f"{task_name}"] = { "message": "No actions to perform." } else: global_results_dict[f"{task_name}"] = ( remote_accounts_maintenance_dict ) logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.' ) ############################################################ # TrackMe Virtual Tenants auto-repair # Goals: # - For each enable Virtual Tenant, verify that all expected # are effectively available in the system. (KV collections...) 
# - If for some reasons an expected object is missing, # auto-repair will attempt to create it and fix the tenant inconsistency. ############################################################ task_start = time.time() task_instance_id = self.get_uuid() task_name = "virtual_tenants:auto-repair:collections_and_transforms" logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) # init auto_repair_actions_list auto_repair_actions_list = [] # A dict to store objects that were verified and their status, per tenant_id as the key tenants_objects_status_dict = {} # collections map per component, including common collections collections_map_per_component = { "dsm": collections_list_dsm, "dhm": collections_list_dhm, "mhm": collections_list_mhm, "flx": collections_list_flx, "fqm": collections_list_fqm, "wlk": collections_list_wlk, "common": collections_list_common, # Add common collections } for vtenant_record in vtenant_records: # get the tenant_id tenant_id = vtenant_record.get("tenant_id") # check if tenant is a replica tenant, if so, skip it if vtenant_record.get("tenant_replica", 0) == 1: logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, is a replica tenant, skipping.' 
) continue # init tenant_id_checked_status_dict tenant_id_checked_status_dict = {} # get RBAC tenant_owner = str(vtenant_record.get("tenant_owner")) tenant_roles_admin = str(vtenant_record.get("tenant_roles_admin")) tenant_roles_user = str(vtenant_record.get("tenant_roles_user")) tenant_roles_power = str(vtenant_record.get("tenant_roles_power")) # TrackMe sharing level trackme_default_sharing = reqinfo["trackme_conf"]["trackme_general"][ "trackme_default_sharing" ] # for read permissions, concatenate admin, power and user tenant_roles_read_perms = ( f"{tenant_roles_admin},{tenant_roles_power},{tenant_roles_user}" ) # for write permissions, concatenate admin, power tenant_roles_write_perms = f"{tenant_roles_admin},{tenant_roles_power}" # for component in dsm, dhm, flx, fqm, wlk and common for component in ["dsm", "dhm", "mhm", "flx", "fqm", "wlk", "common"]: # get status try: component_status = int( vtenant_record.get( f"tenant_{component}_enabled", 1 ) # Default to 1 for common ) except Exception as e: component_status = 0 # only continue if component is enabled if component_status == 1: # Handle collections for object_name in collections_map_per_component[component]: # # Verify that the KV collection exists # kvstore_collection_name = ( f"kv_{object_name}_tenant_{tenant_id}" ) kvstore_collection_exists = ( True # assume the collection exists ) # check if the collection exists try: collection = self.service.kvstore[ kvstore_collection_name ] logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, collection_name={kvstore_collection_name}, kvstore_collection_exists={kvstore_collection_exists}' ) tenant_id_checked_status_dict[ kvstore_collection_name ] = { "result": "success", "type": "kvstore_collection", } except Exception as e: kvstore_collection_exists = False logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, 
tenant_id={tenant_id}, component={component}, failed to retrieve the collection, collection_name="{kvstore_collection_name}", exception="{str(e)}"' ) tenant_id_checked_status_dict[ kvstore_collection_name ] = { "result": "failure", "exception": str(e), "type": "kvstore_collection", } # # Verify that the transform exists and contains the expected fields # transform_name = f"{object_name}_tenant_{tenant_id}" transform_exists = True # assume the transform exists transforms_fields_list_csv = None transforms_fields_list = None transforms_expected_fields_list_csv = collections_dict[ object_name ] transforms_expected_fields_list = [ x.strip() for x in transforms_expected_fields_list_csv.split(",") ] transforms_has_missing_fields = False # assume False # check if the transform exists logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, inspecting transform_name={transform_name}' ) try: transform = self.service.confs["transforms"][ transform_name ] transforms_fields_list_csv = transform["fields_list"] transforms_fields_list = ( [ x.strip() for x in transforms_fields_list_csv.split(",") ] if transforms_fields_list_csv else [] ) # Verify that the transforms has at the minimum the expected fields for expected_field in transforms_expected_fields_list: if expected_field not in transforms_fields_list: transforms_has_missing_fields = True logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, transforms_name={transform_name}, transforms_fields_list={transforms_fields_list}, transforms_has_missing_fields={transforms_has_missing_fields}' ) except Exception as e: transform_exists = False logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, failed to retrieve the transform, transform_name="{transform_name}", 
exception="{str(e)}"' ) # temp logging logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, kvstore_collection_exists={kvstore_collection_exists}, transform_exists={transform_exists}, transform_fields_list={transforms_fields_list}, transforms_has_missing_fields={transforms_has_missing_fields}' ) # # Take action if needed # # # KVstore collection # # If the KVstore collection does not exist, create it if not kvstore_collection_exists: logging.warning( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, collection_name={kvstore_collection_name}, kvstore_collection_exists={kvstore_collection_exists}, the KVstore collection was detected missing, it will be created.' ) # create the KVstore collection url = f'{reqinfo["server_rest_uri"]}/services/trackme/v2/configuration/admin/create_kvcollection' data = { "tenant_id": tenant_id, "collection_name": kvstore_collection_name, "collection_acl": { "owner": tenant_owner, "sharing": trackme_default_sharing, "perms.write": tenant_roles_write_perms, "perms.read": tenant_roles_read_perms, }, "owner": tenant_owner, } try: response = requests.post( url, headers={ "Authorization": f"Splunk {self._metadata.searchinfo.session_key}" }, data=json.dumps(data), verify=False, timeout=600, ) response.raise_for_status() logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, collection_name={kvstore_collection_name}, kvstore_collection_exists={kvstore_collection_exists}, the KVstore collection was detected missing, it has been created successfully.' 
) # add to auto_repair_actions_list auto_repair_actions_list.append( { "action": "create_kvcollection", "tenant_id": tenant_id, "component": component, "collection_name": kvstore_collection_name, "message": "KVstore collection was detected missing, it has been created successfully.", } ) except Exception as e: logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, collection_name={kvstore_collection_name}, kvstore_collection_exists={kvstore_collection_exists}, the KVstore collection was detected missing, attempt to create it has failed, exception="{str(e)}"' ) # add to auto_repair_actions_list auto_repair_actions_list.append( { "action": "create_kvcollection", "tenant_id": tenant_id, "component": component, "collection_name": kvstore_collection_name, "message": "KVstore collection was detected missing, attempt to create it has failed.", "exception": str(e), } ) # # Transforms definition: If the transforms does not exist, create it, if it exists but has missing fields, it will be deleted and recreated # if transform_exists and not transforms_has_missing_fields: tenant_id_checked_status_dict[transform_name] = { "result": "success", "type": "transform", } elif not transform_exists: tenant_id_checked_status_dict[transform_name] = { "result": "failure", "exception": "The transform was detected missing.", "type": "transform", } elif transform_exists and transforms_has_missing_fields: tenant_id_checked_status_dict[transform_name] = { "result": "failure", "exception": "The transform was detected as existing but has missing fields.", "type": "transform", } else: tenant_id_checked_status_dict[transform_name] = { "result": "unknown", "transform_exists": transform_exists, "transforms_has_missing_fields": transforms_has_missing_fields, "type": "transform", } if not transform_exists or transforms_has_missing_fields: if not transform_exists: logging.warning( f'instance_id={instance_id}, 
task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, transforms_name={transform_name}, transforms_exists={transform_exists}, the transform was detected missing, it will be created.' ) if transform_exists and transforms_has_missing_fields: logging.warning( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, transforms_name={transform_name}, transforms_exists={transform_exists}, the transform was detected as existing but has missing fields, it will be recreated.' ) # # delete the transform # url = f'{reqinfo["server_rest_uri"]}/services/trackme/v2/configuration/admin/delete_kvtransform' data = { "tenant_id": tenant_id, "transform_name": transform_name, } try: response = requests.post( url, headers={ "Authorization": f"Splunk {self._metadata.searchinfo.session_key}" }, data=json.dumps(data), verify=False, timeout=600, ) response.raise_for_status() logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, transforms_name={transform_name}, transforms_exists={transform_exists}, the transform was detected as existing but has missing fields, it has been deleted successfully.' ) # add to auto_repair_actions_list auto_repair_actions_list.append( { "action": "delete_kvtransform", "tenant_id": tenant_id, "component": component, "transform_name": transform_name, "message": "The transform was detected as existing but has missing fields, it has been deleted successfully.", } ) except Exception as e: logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, transforms_name={transform_name}, transforms_exists={transform_exists}, the transform was detected as existing but has missing fields, it has been deleted successfully.' 
) # add to auto_repair_actions_list auto_repair_actions_list.append( { "action": "delete_kvtransform", "tenant_id": tenant_id, "component": component, "transform_name": transform_name, "message": "The transform was detected as existing but has missing fields, attempt to delete it has failed.", "exception": str(e), } ) # # create the transform # url = f'{reqinfo["server_rest_uri"]}/services/trackme/v2/configuration/admin/create_kvtransform' data = { "tenant_id": tenant_id, "transform_name": transform_name, "transform_fields": transforms_expected_fields_list_csv, "collection_name": kvstore_collection_name, "transform_acl": { "owner": tenant_owner, "sharing": trackme_default_sharing, "perms.write": tenant_roles_write_perms, "perms.read": tenant_roles_read_perms, }, "owner": tenant_owner, } try: response = requests.post( url, headers={ "Authorization": f"Splunk {self._metadata.searchinfo.session_key}" }, data=json.dumps(data), verify=False, timeout=600, ) response.raise_for_status() logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, transforms_name={transform_name}, transforms_exists={transform_exists}, the transform was detected missing, it has been created successfully.' ) # add to auto_repair_actions_list auto_repair_actions_list.append( { "action": "create_kvtransform", "tenant_id": tenant_id, "component": component, "transform_name": transform_name, "message": "The transform was detected missing or inconsistent, it has been created successfully.", } ) except Exception as e: logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, transforms_name={transform_name}, transforms_exists={transform_exists}, the transform was detected missing, it has been created successfully.' 
) # add to auto_repair_actions_list auto_repair_actions_list.append( { "action": "create_kvtransform", "tenant_id": tenant_id, "component": component, "transform_name": transform_name, "message": "The transform was detected missing or inconsistent, attempt to create it has failed.", "exception": str(e), } ) # add to tenants_objects_status_dict tenants_objects_status_dict[tenant_id] = tenant_id_checked_status_dict # add to global_results_dict global_results_dict[f"{task_name}"] = { "knowledge_objects_status": tenants_objects_status_dict, "auto_repair_actions_list": ( auto_repair_actions_list if auto_repair_actions_list else "No actions to perform." ), } logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.' ) ############################################################ # TrackMe Virtual Tenants configuration issues fixer # Goals: # - Run a Splunk search to identify Virtual Tenants with configuration issues (missing reports) # - For each tenant found, identify enabled components from the central KVstore collection # - For each tenant/component combination, run the REST API call to fix issues # - Exclude replica tenants (tenant_replica = 1) ############################################################ task_start = time.time() task_instance_id = self.get_uuid() task_name = "virtual_tenants:auto-repair:components_reports" logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' 
) # Step 1: Run Splunk search to identify tenants with missing reports search = remove_leading_spaces( """ search (index=_internal sourcetype=trackme:rest_api) OR (index=_internal sourcetype=trackme:custom_commands:*) log_level=error task="optimize_tenant_scheduled_reports" "failure to get report report_name" "urlencoded" | stats count by tenant_id """) kwargs_oneshot = { "earliest_time": "-24h", "latest_time": "now", "output_mode": "json", "count": 0, } # Counters tenants_with_issues_found = 0 tenants_processed = 0 tenants_fixed = 0 tenants_skipped = 0 total_components_fixed = 0 total_components_failed = 0 # Lists to store detailed information tenants_with_issues = [] tenants_processed_details = [] rest_call_responses = [] try: reader = run_splunk_search( self.service, search, kwargs_oneshot, 24, 5, ) for item in reader: if isinstance(item, dict): tenant_id = item.get("tenant_id") error_count = item.get("count", 0) if tenant_id: tenants_with_issues.append({ "tenant_id": tenant_id, "error_count": error_count }) tenants_with_issues_found += 1 except Exception as e: logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to run search for tenants with issues, exception="{str(e)}"' ) logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, found {tenants_with_issues_found} tenants with configuration issues' ) # Step 2: Process each tenant found for tenant_info in tenants_with_issues: tenant_id = tenant_info["tenant_id"] error_count = tenant_info["error_count"] tenants_processed += 1 # Initialize tenant processing details tenant_processing_detail = { "tenant_id": tenant_id, "error_count": error_count, "enabled_components": [], "components_fixed": 0, "components_failed": 0, "is_replica": False, "skipped_reason": None, "processing_status": "processing" } logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, processing tenant 
{tenant_id} with {error_count} errors' ) # Get tenant record from central KVstore collection try: # Find the tenant record in vtenant_records (already loaded) tenant_record = None for vtenant_record in vtenant_records: if vtenant_record.get("tenant_id") == tenant_id: tenant_record = vtenant_record break if not tenant_record: logging.warning( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, tenant record not found in central collection, skipping' ) tenant_processing_detail["skipped_reason"] = "tenant record not found in central collection" tenant_processing_detail["processing_status"] = "skipped" tenants_processed_details.append(tenant_processing_detail) tenants_skipped += 1 continue # Check if tenant is a replica (exclude if so) try: tenant_replica = int(tenant_record.get("tenant_replica", 0)) except Exception as e: tenant_replica = 0 tenant_processing_detail["is_replica"] = (tenant_replica == 1) if tenant_replica == 1: logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, is a replica tenant, skipping' ) tenant_processing_detail["skipped_reason"] = "replica tenant" tenant_processing_detail["processing_status"] = "skipped" tenants_processed_details.append(tenant_processing_detail) tenants_skipped += 1 continue # Get enabled components for this tenant enabled_components = [] component_fields = { "dsm": "tenant_dsm_enabled", "dhm": "tenant_dhm_enabled", "mhm": "tenant_mhm_enabled", "flx": "tenant_flx_enabled", "wlk": "tenant_wlk_enabled", "fqm": "tenant_fqm_enabled" } for component, field_name in component_fields.items(): try: if int(tenant_record.get(field_name, 0)) == 1: enabled_components.append(component) except (ValueError, TypeError): continue tenant_processing_detail["enabled_components"] = enabled_components if not enabled_components: logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, 
tenant_id={tenant_id}, has no enabled components, skipping' ) tenant_processing_detail["skipped_reason"] = "no enabled components" tenant_processing_detail["processing_status"] = "skipped" tenants_processed_details.append(tenant_processing_detail) tenants_skipped += 1 continue logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, has enabled components: {enabled_components}' ) # Step 3: Fix each enabled component tenant_components_fixed = 0 tenant_components_failed = 0 for component in enabled_components: try: # Prepare the REST API call target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/vtenants/admin/check_component_tenant" payload = { "tenant_id": tenant_id, "component_target": component, "update_comment": f"Automated fix for missing reports - general health manager task" } logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, fixing component {component}' ) # Make the REST call response = requests.post( target_url, headers=header, data=json.dumps(payload), verify=False, timeout=600 ) # Store REST call response details rest_call_response = { "tenant_id": tenant_id, "component": component, "status_code": response.status_code, "success": response.status_code == 200, "response_text": response.text, "timestamp": time.time() } # Try to parse JSON response if possible try: rest_call_response["response_json"] = response.json() except: rest_call_response["response_json"] = None rest_call_responses.append(rest_call_response) if response.status_code == 200: tenant_components_fixed += 1 total_components_fixed += 1 logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component {component}, successfully fixed' ) else: tenant_components_failed += 1 total_components_failed += 1 logging.error( f'instance_id={instance_id}, task="{task_name}", 
task_instance_id={task_instance_id}, tenant_id={tenant_id}, component {component}, failed to fix: {response.status_code} - {response.text}' ) except Exception as e: tenant_components_failed += 1 total_components_failed += 1 # Store exception details rest_call_response = { "tenant_id": tenant_id, "component": component, "status_code": None, "success": False, "response_text": str(e), "timestamp": time.time(), "exception": True } rest_call_responses.append(rest_call_response) logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component {component}, exception during fix: {str(e)}' ) if tenant_components_fixed > 0: tenants_fixed += 1 # Update tenant processing details tenant_processing_detail["components_fixed"] = tenant_components_fixed tenant_processing_detail["components_failed"] = tenant_components_failed tenant_processing_detail["processing_status"] = "completed" logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, completed: {tenant_components_fixed} components fixed, {tenant_components_failed} components failed' ) except Exception as e: logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, exception during processing: {str(e)}' ) tenant_processing_detail["skipped_reason"] = f"exception during processing: {str(e)}" tenant_processing_detail["processing_status"] = "error" tenants_skipped += 1 # Always add the tenant processing detail to the list tenants_processed_details.append(tenant_processing_detail) # add to global_results_dict global_results_dict[f"{task_name}"] = { "tenants_with_issues_found": tenants_with_issues_found, "tenants_processed": tenants_processed, "tenants_fixed": tenants_fixed, "tenants_skipped": tenants_skipped, "total_components_fixed": total_components_fixed, "total_components_failed": total_components_failed, "tenants_with_issues": 
tenants_with_issues, "tenants_processed_details": tenants_processed_details, "rest_call_responses": rest_call_responses, "result": f"{tenants_with_issues_found} tenants with issues found, {tenants_processed} processed, {tenants_fixed} fixed, {tenants_skipped} skipped, {total_components_fixed} components fixed, {total_components_failed} components failed", } logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.' ) ############################################################ # TrackMe Virtual Tenants Check Health Tracker # Goals: # - If the tenant is enabled, then the health tracker should be enabled and scheduled. # - If the tenant is disabled, then the health tracker should be disabled. ############################################################ task_start = time.time() task_instance_id = self.get_uuid() task_name = "virtual_tenants:check_health_tracker" logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) def manage_savedsearch_schedule( tenant_id, savedsearch_names, feature_enabled, feature_name ): """ Helper function to manage saved search scheduling based on feature enablement. 
Args: savedsearch_names: List of saved search names to manage feature_enabled: Boolean indicating if the feature should be enabled feature_name: String name of the feature for logging purposes """ for savedsearch_name in savedsearch_names: # get the status of the savedsearch savedsearch_properties, savedsearch_acl = ( trackme_manage_report_schedule( logging, self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri, tenant_id, savedsearch_name, action="status", ) ) # log logging.info( f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", savedsearch_properties="{json.dumps(savedsearch_properties, indent=2)}", savedsearch_acl="{json.dumps(savedsearch_acl, indent=2)}"' ) # get the disabled status disabled = int(savedsearch_properties.get("disabled", 0)) # get the is_scheduled status is_scheduled = int(savedsearch_properties.get("is_scheduled", 0)) # Check tenant status first - if tenant is disabled, ensure health tracker is disabled if feature_enabled == False: # Tenant is disabled - ensure health tracker is disabled (but keep it scheduled) if disabled == 0: # Report is enabled - disable it logging.info( f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", tenant is disabled, disabling savedsearch.' ) try: trackme_report_update_enablement( self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri, tenant_id, savedsearch_name, "disable", ) logging.info( f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", savedsearch disabled successfully.' 
) return { "action": "disable_savedsearch", "tenant_id": tenant_id, "savedsearch_name": savedsearch_name, "message": "The savedsearch has been disabled successfully.", } except Exception as e: logging.error( f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", an exception was encountered while trying to disable savedsearch, exception="{str(e)}"' ) else: # Report is already disabled - nothing to do logging.info( f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", tenant is disabled and savedsearch is already disabled, nothing to do.' ) return { "action": "nothing_to_do", "tenant_id": tenant_id, "savedsearch_name": savedsearch_name, "message": "Tenant is disabled and savedsearch is already disabled, nothing to do.", } # Tenant is enabled - ensure health tracker is enabled AND scheduled elif feature_enabled == True: # Track if we performed any actions action_performed = False action_message = "" # Check if we need to enable the report if disabled == 1: logging.info( f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", the report is currently disabled and needs to be enabled.' 
) try: trackme_report_update_enablement( self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri, tenant_id, savedsearch_name, "enable", ) logging.info( f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", savedsearch enabled successfully.' ) action_performed = True action_message = "The savedsearch has been enabled successfully" except Exception as e: logging.error( f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", an exception was encountered while trying to enable the savedsearch, exception="{str(e)}"' ) # stop here if we had an exception enabling the savedsearch continue # Check if we need to schedule the report if is_scheduled == 0: logging.info( f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", the report needs to be scheduled.' 
) try: savedsearch_properties, savedsearch_acl = ( trackme_manage_report_schedule( logging, self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri, tenant_id, savedsearch_name, action="enable", ) ) logging.info( f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", savedsearch scheduled successfully, properties="{json.dumps(savedsearch_properties, indent=2)}"' ) action_performed = True if action_message: action_message += " and scheduled successfully." else: action_message = "The savedsearch has been scheduled successfully." except Exception as e: logging.error( f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", an exception was encountered while trying to schedule savedsearch, exception="{str(e)}"' ) # Return appropriate result based on actions performed if action_performed: return { "action": "enable_savedsearch", "tenant_id": tenant_id, "savedsearch_name": savedsearch_name, "message": action_message, } else: # Report is already enabled and scheduled - nothing to do logging.info( f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", tenant is enabled and savedsearch is already enabled and scheduled, nothing to do.' 
) return { "action": "nothing_to_do", "tenant_id": tenant_id, "savedsearch_name": savedsearch_name, "message": "Tenant is enabled and savedsearch is already enabled and scheduled, nothing to do.", } else: # This should not happen as we've covered all cases above logging.warning( f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", unexpected state, nothing to do.' ) return { "action": "nothing_to_do", "tenant_id": tenant_id, "savedsearch_name": savedsearch_name, "message": "Unexpected state, nothing to do.", } # init auto_repair_actions_list auto_repair_actions_list = [] # A dict to store objects that were verified and their status, per tenant_id as the key tenants_objects_status_dict = {} for vtenant_record in vtenant_records: # get the tenant_id tenant_id = vtenant_record.get("tenant_id") # get the tenant_status (enabled/disabled) tenant_status = vtenant_record.get("tenant_status", "enabled") # check if tenant is a replica tenant, if so, skip it if vtenant_record.get("tenant_replica", 0) == 1: logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, is a replica tenant, skipping.' 
) continue # init health_tracker_report_name health_tracker_report_name = ( f"trackme_health_tracker_tenant_{tenant_id}" ) health_tracker_check_result = {} try: # Determine if health tracker should be enabled based on tenant status health_tracker_enabled = (tenant_status == "enabled") health_tracker_check_result = manage_savedsearch_schedule( tenant_id, [health_tracker_report_name], health_tracker_enabled, "health_tracker" ) except Exception as e: logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"' ) # add to tenants_objects_status_dict tenants_objects_status_dict[tenant_id] = health_tracker_check_result # add to global_results_dict global_results_dict[f"{task_name}"] = { "health_tracker_check_result": tenants_objects_status_dict, } logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.' ) ############################################################ # TrackMe Virtual Tenants stateful alerts records expiration # Goals: # - For each enable Virtual Tenant, search for closed stateful alerts records # in the KVstore collection, and delete them if they are older than 30 days. # When purging statefule alerts records, search and purge associated charts records. (if any) ############################################################ task_start = time.time() task_instance_id = self.get_uuid() task_name = "virtual_tenants:stateful_alerts_records_expiration" logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' 
) # get the stateful records expiration days stateful_records_expiration_days = int( reqinfo["trackme_conf"]["trackme_general"][ "trackme_stateful_records_expiration_days" ] ) # counters expired_statefule_records_deleted_count = 0 expired_associated_charts_records_deleted_count = 0 orphans_charts_records_deleted_count = 0 for vtenant_record in vtenant_records: # get the tenant_id tenant_id = vtenant_record.get("tenant_id") # check if tenant is a replica tenant, if so, skip it if vtenant_record.get("tenant_replica", 0) == 1: logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, is a replica tenant, skipping.' ) continue # Define the query search = remove_leading_spaces( f""" | inputlookup trackme_stateful_alerting_tenant_{tenant_id} where alert_status="closed" | eval keyid=_key | eval record_age=now()-ctime, is_expired=if(record_age > 86400*{stateful_records_expiration_days}, 1, 0) | where is_expired=1 | table keyid, incident_id """ ) # A list to stored expired incident_id expired_incident_id_list = [] # A list to store expired records expired_records_list = [] # A list to store expired associated charts records expired_associated_charts_records_list = [] # A list to store orphans charts records orphans_charts_records = [] # Run the search logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, running search="{search}"' ) try: reader = run_splunk_search( self.service, search, { "earliest_time": "-5m", "latest_time": "now", "output_mode": "json", "count": 0, }, 24, 5, ) for item in reader: if isinstance(item, dict): expired_records_list.append(item.get("keyid")) expired_incident_id_list.append(item.get("incident_id")) logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, records for incident_id={item.get("incident_id")} have been detected as expired and will be 
deleted from the KVstore collections, keyid={item.get("keyid")}' ) except Exception as e: logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to retrieve the list of expired records, exception="{str(e)}"' ) # If nothing to do, continue if not expired_records_list: logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, no expired records to process, skipping.' ) continue else: # Run a new search to retrieve the list of associated charts records # Convert the list to a CSV string filtered expired_records_filtered_csv = ( f"({','.join(expired_incident_id_list)})" ) search = remove_leading_spaces( f""" | inputlookup trackme_stateful_alerting_charts_tenant_{tenant_id} where incident_id IN {expired_records_filtered_csv} | eval keyid=_key | table keyid, incident_id """ ) # Run the search logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, running search="{search}"' ) try: reader = run_splunk_search( self.service, search, { "earliest_time": "-5m", "latest_time": "now", "output_mode": "json", "count": 0, }, 24, 5, ) for item in reader: if isinstance(item, dict): expired_associated_charts_records_list.append( item.get("keyid") ) except Exception as e: logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to retrieve the list of associated charts records, exception="{str(e)}"' ) # Run a new search to retrieve the list of orphans charts records search = remove_leading_spaces( f""" | inputlookup trackme_stateful_alerting_charts_tenant_{tenant_id} | eval keyid=_key | lookup trackme_stateful_alerting_tenant_{tenant_id} incident_id AS incident_id OUTPUT incident_id as parent_incident_id | where (isnull(parent_incident_id) OR parent_incident_id="") | table keyid, incident_id """ ) # Run the 
search logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, running search="{search}"' ) try: reader = run_splunk_search( self.service, search, { "earliest_time": "-5m", "latest_time": "now", "output_mode": "json", "count": 0, }, 24, 5, ) for item in reader: if isinstance(item, dict): orphans_charts_records.append(item.get("keyid")) except Exception as e: logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to retrieve the list of orphans charts records, exception="{str(e)}"' ) # Purge expired records from the stateful collection, if any if expired_records_list: # connect to the collection collection_stateful_alerting_name = ( f"kv_trackme_stateful_alerting_tenant_{tenant_id}" ) collection_stateful_alerting = self.service.kvstore[ collection_stateful_alerting_name ] # for each expired record, delete the record from the stateful collection for expired_record in expired_records_list: try: # Remove the record collection_stateful_alerting.data.delete( json.dumps({"_key": expired_record}) ) expired_statefule_records_deleted_count += 1 except Exception as e: logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to delete the record from collection="{collection_stateful_alerting_name}", exception="{str(e)}"' ) # Purge expired associated charts records from the stateful collection, if any if expired_associated_charts_records_list: # connect to the collection collection_stateful_alerting_name = ( f"kv_trackme_stateful_alerting_charts_tenant__{tenant_id}" ) # for each expired record, delete the record from the stateful collection for expired_record in expired_associated_charts_records_list: try: # Remove the record collection_stateful_alerting.data.delete( json.dumps({"_key": expired_record}) ) expired_associated_charts_records_deleted_count += 1 except 
Exception as e: logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to delete the record from collection="{collection_stateful_alerting_name}", exception="{str(e)}"' ) # Purge orphans charts records from the stateful collection, if any if orphans_charts_records: # connect to the collection collection_stateful_alerting_name = ( f"kv_trackme_stateful_alerting_charts_tenant__{tenant_id}" ) # for each expired record, delete the record from the stateful collection for orphan_record in orphans_charts_records: try: # Remove the record collection_stateful_alerting.data.delete( json.dumps({"_key": orphan_record}) ) expired_associated_charts_records_deleted_count += 1 except Exception as e: logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to delete the record from collection="{collection_stateful_alerting_name}", exception="{str(e)}"' ) # add to global_results_dict global_results_dict[ f"{task_name}" ] = { "expired_statefule_records_deleted_count": expired_statefule_records_deleted_count, "expired_associated_charts_records_deleted_count": expired_associated_charts_records_deleted_count, "orphans_charts_records_deleted_count": orphans_charts_records_deleted_count, } logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.' 
) ############################################################ # TrackMe Virtual Tenants stateful alerts duplicate opened incidents cleanup # Goals: # - For each enabled Virtual Tenant, verify that for a given object_id, # there should not be more than one opened incident (alert_status="opened") in the KVstore # - If there are more than one incident_id for the same object_id, keep only the latest # (based on the field mtime which is the epochtime of the last modification of the incident_id), # other records should be updated with alert_status="closed" ############################################################ task_start = time.time() task_instance_id = self.get_uuid() task_name = ( "virtual_tenants:stateful_alerts_duplicate_opened_incidents_cleanup" ) logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) # counters duplicate_opened_incidents_found_count = 0 duplicate_opened_incidents_resolved_count = 0 duplicate_opened_incidents_resolution_failures_count = 0 for vtenant_record in vtenant_records: # get the tenant_id tenant_id = vtenant_record.get("tenant_id") # check if tenant is a replica tenant, if so, skip it if vtenant_record.get("tenant_replica", 0) == 1: logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, is a replica tenant, skipping.' 
) continue # Define the query to find duplicate opened incidents for the same object_id search = remove_leading_spaces( f""" | inputlookup trackme_stateful_alerting_tenant_{tenant_id} where alert_status="opened" | eval keyid=_key | eval _time=mtime | stats count as incident_count, values(incident_id) as incident_ids, latest(incident_id) as latest_incident_id, latest(keyid) as latest_keyid, values(keyid) as keyids, max(mtime) as max_mtime by object_id | where incident_count > 1 | eval to_close_keyids=mvmap(keyids, if(mvfind(keyids, "^\\\\"" + latest_keyid + "\\$")=0, null(), keyids)) | eval to_close_incident_ids=mvmap(incident_ids, if(mvfind(incident_ids, "^\\\\"" + latest_incident_id + "\\$")=0, null(), incident_ids)) | table object_id, incident_count, latest_incident_id, latest_keyid, max_mtime, to_close_keyids, to_close_incident_ids """ ) # A list to store duplicate opened incidents data duplicate_opened_incidents_list = [] # Run the search logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, running search="{search}"' ) try: reader = run_splunk_search( self.service, search, { "earliest_time": "-5m", "latest_time": "now", "output_mode": "json", "count": 0, }, 24, 5, ) for item in reader: if isinstance(item, dict): duplicate_opened_incidents_list.append(item) duplicate_opened_incidents_found_count += 1 logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, found duplicate opened incidents for object_id={item.get("object_id")}, incident_count={item.get("incident_count")}, latest_incident_id={item.get("latest_incident_id")}, latest_keyid={item.get("latest_keyid")}, max_mtime={item.get("max_mtime")}' ) except Exception as e: logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to retrieve the list of duplicate opened incidents, exception="{str(e)}"' ) # 
If nothing to do, continue if not duplicate_opened_incidents_list: logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, no duplicate opened incidents to process, skipping.' ) continue # Process duplicate opened incidents for duplicate_incident in duplicate_opened_incidents_list: object_id = duplicate_incident.get("object_id") to_close_keyids = duplicate_incident.get("to_close_keyids", []) # if not a list, turn into a list csv if not isinstance(to_close_keyids, list): to_close_keyids = to_close_keyids.split(",") to_close_incident_ids = duplicate_incident.get( "to_close_incident_ids", [] ) # if not a list, turn into a list csv if not isinstance(to_close_incident_ids, list): to_close_incident_ids = to_close_incident_ids.split(",") # Parse the to_close_keyids and to_close_incident_ids if to_close_keyids: to_close_keyids_list = [ keyid.strip() for keyid in to_close_keyids if keyid.strip() ] else: to_close_keyids_list = [] if to_close_incident_ids: to_close_incident_ids_list = [ incident_id.strip() for incident_id in to_close_incident_ids if incident_id.strip() ] else: to_close_incident_ids_list = [] logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, object_id={object_id}, will close {len(to_close_keyids_list)} duplicate incidents: keyids={to_close_keyids_list}, incident_ids={to_close_incident_ids_list}' ) # Connect to the collection collection_stateful_alerting_name = ( f"kv_trackme_stateful_alerting_tenant_{tenant_id}" ) collection_stateful_alerting = self.service.kvstore[ collection_stateful_alerting_name ] # Update each duplicate incident to closed status for keyid in to_close_keyids_list: try: # Get the current record record_list = collection_stateful_alerting.data.query( query=json.dumps({"_key": keyid}) ) if record_list and len(record_list) > 0: # Extract the first (and should be only) record from the list record = 
record_list[0] # Update the record to closed status record["alert_status"] = "closed" record["mtime"] = int( time.time() ) # Update modification time # Update the record in the collection collection_stateful_alerting.data.update( keyid, json.dumps(record) ) duplicate_opened_incidents_resolved_count += 1 logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, object_id={object_id}, successfully closed duplicate incident with keyid={keyid}' ) else: logging.warning( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, object_id={object_id}, record with keyid={keyid} not found in collection' ) except Exception as e: duplicate_opened_incidents_resolution_failures_count += 1 logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, object_id={object_id}, failed to close duplicate incident with keyid={keyid}, exception="{str(e)}"' ) # add to global_results_dict global_results_dict[ f"{task_name}" ] = { "duplicate_opened_incidents_found_count": duplicate_opened_incidents_found_count, "duplicate_opened_incidents_resolved_count": duplicate_opened_incidents_resolved_count, "duplicate_opened_incidents_resolution_failures_count": duplicate_opened_incidents_resolution_failures_count, "result": f"{duplicate_opened_incidents_found_count} duplicate opened incidents found, {duplicate_opened_incidents_resolved_count} resolved successfully, {duplicate_opened_incidents_resolution_failures_count} resolution failures", } logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.' 
) ############################################################ # TrackMe Virtual Tenants stateful charts records expiration # Goals: # - For each Virtual tenant, purge any record in the stateful charts KVstore collection: # trackme_stateful_alerting_charts_tenant_ # - For each KVrecord which is equal or older to 48 hours, based on the field "ctime" # of the record which contains the epochtime of its creation ############################################################ task_start = time.time() task_instance_id = self.get_uuid() task_name = "virtual_tenants:stateful_charts_records_expiration" logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) # get the stateful records expiration days stateful_records_expiration_days = int( reqinfo["trackme_conf"]["trackme_general"][ "trackme_stateful_charts_records_expiration_days" ] ) # Define the expiration threshold (based on the expiration days) charts_records_expiration_seconds = stateful_records_expiration_days * 24 * 3600 current_time = time.time() # counters expired_charts_records_deleted_count = 0 expired_charts_records_deletion_failures_count = 0 for vtenant_record in vtenant_records: # get the tenant_id tenant_id = vtenant_record.get("tenant_id") # check if tenant is a replica tenant, if so, skip it if vtenant_record.get("tenant_replica", 0) == 1: logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, is a replica tenant, skipping.' 
) continue # connect to the stateful charts collection collection_stateful_charts_name = ( f"kv_trackme_stateful_alerting_charts_tenant_{tenant_id}" ) try: collection_stateful_charts = self.service.kvstore[collection_stateful_charts_name] # get all records from the collection ( charts_records, charts_collection_keys, charts_collection_dict, ) = get_full_kv_collection( collection_stateful_charts, collection_stateful_charts_name ) # A list to store expired records to delete expired_charts_records_list = [] # Process each record to check if it's older than 48 hours for record in charts_records: try: # Get the ctime field and convert from string to float if needed ctime_str = record.get("ctime") if ctime_str is None: logging.warning( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, record with key={record.get("_key")} has no ctime field, skipping.' ) continue # Convert string to float if needed if isinstance(ctime_str, str): ctime_float = float(ctime_str) else: ctime_float = float(ctime_str) # Calculate age in seconds record_age_seconds = current_time - ctime_float # Check if record is older than 48 hours if record_age_seconds >= charts_records_expiration_seconds: expired_charts_records_list.append(record.get("_key")) logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, record with key={record.get("_key")} is {round(record_age_seconds/3600, 2)} hours old (>= 48 hours), will be deleted.' 
) except (ValueError, TypeError) as e: logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to process ctime field for record with key={record.get("_key")}, ctime="{ctime_str}", exception="{str(e)}"' ) continue # If no expired records, continue to next tenant if not expired_charts_records_list: logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, no expired charts records to process, skipping.' ) continue # Delete expired records from the collection for expired_record_key in expired_charts_records_list: try: # Remove the record collection_stateful_charts.data.delete( json.dumps({"_key": expired_record_key}) ) expired_charts_records_deleted_count += 1 logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, successfully deleted expired charts record with key={expired_record_key}' ) except Exception as e: expired_charts_records_deletion_failures_count += 1 logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to delete expired charts record with key={expired_record_key}, exception="{str(e)}"' ) except Exception as e: logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to access collection="{collection_stateful_charts_name}", exception="{str(e)}"' ) # add to global_results_dict global_results_dict[f"{task_name}"] = { "expired_charts_records_deleted_count": expired_charts_records_deleted_count, "expired_charts_records_deletion_failures_count": expired_charts_records_deletion_failures_count, "result": f"{expired_charts_records_deleted_count} expired charts records deleted, {expired_charts_records_deletion_failures_count} deletion failures", } logging.info( f'instance_id={instance_id}, task="{task_name}", 
task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.' ) ############################################################ # Recurring Bank Holidays Management # Goals: # - Process recurring bank holidays and create future occurrences # - Handle holidays that span across years (e.g., Dec 31 - Jan 1) # - Clean up past bank holiday periods that have already ended ############################################################ task_start = time.time() task_instance_id = self.get_uuid() task_name = "bank-holidays:recurring_periods_management" logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) # counters recurring_holidays_processed_count = 0 new_periods_created_count = 0 periods_creation_failures_count = 0 past_periods_deleted_count = 0 past_periods_deletion_failures_count = 0 try: # Connect to bank holidays collection collection_name = "kv_trackme_bank_holidays" collection = self.service.kvstore[collection_name] # Get current time and year current_time = time.time() current_year = datetime.datetime.fromtimestamp(current_time, tz=datetime.timezone.utc).year ############################################################ # Step 1: Clean up past bank holiday periods # Strategy: # - For recurring holidays: Keep the oldest one for each pattern (template), delete other past duplicates # - For non-recurring holidays: Delete all that are past ############################################################ logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting cleanup of past bank holiday periods.' 
) try: # Get all bank holidays all_holidays = collection.data.query() # Group recurring holidays by pattern to identify templates recurring_by_pattern = {} non_recurring_past = [] for holiday in all_holidays: holiday_dict = dict(holiday) holiday_key = holiday_dict.get("_key") end_date_epoch = holiday_dict.get("end_date") is_recurring = holiday_dict.get("is_recurring", False) if not end_date_epoch: logging.warning( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, holiday with key={holiday_key} has no end_date, skipping cleanup check.' ) continue # Check if period has already passed if int(end_date_epoch) < int(current_time): if is_recurring: # For recurring holidays, group by pattern to keep templates period_name = holiday_dict.get("period_name", "") country_code = holiday_dict.get("country_code", "") start_date_epoch = holiday_dict.get("start_date") if start_date_epoch: try: start_dt = datetime.datetime.fromtimestamp(start_date_epoch, tz=datetime.timezone.utc) end_dt = datetime.datetime.fromtimestamp(end_date_epoch, tz=datetime.timezone.utc) # Create pattern key pattern_key = f"{period_name}|{country_code}|{start_dt.month:02d}-{start_dt.day:02d}|{end_dt.month:02d}-{end_dt.day:02d}" if pattern_key not in recurring_by_pattern: recurring_by_pattern[pattern_key] = [] recurring_by_pattern[pattern_key].append({ "key": holiday_key, "time_created": holiday_dict.get("time_created", 0), "holiday": holiday_dict }) except Exception as e: logging.warning( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to parse dates for holiday key={holiday_key}, exception="{str(e)}", treating as non-recurring for cleanup.' 
) non_recurring_past.append(holiday_dict) else: # Missing start_date, treat as non-recurring non_recurring_past.append(holiday_dict) else: # Non-recurring past holiday - mark for deletion non_recurring_past.append(holiday_dict) # Delete non-recurring past holidays via REST API for holiday_dict in non_recurring_past: holiday_key = holiday_dict.get("_key") period_name = holiday_dict.get("period_name", "unknown") try: end_date_epoch = holiday_dict.get("end_date") end_dt = datetime.datetime.fromtimestamp(end_date_epoch, tz=datetime.timezone.utc) end_date_str = end_dt.strftime("%Y-%m-%d %H:%M:%S") # Use REST API endpoint for deletion (enables auditing) target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/bank_holidays/admin/delete" payload = {"_key": holiday_key} response = requests.post( target_url, headers=header, data=json.dumps(payload), verify=False, timeout=600 ) if response.status_code == 200: past_periods_deleted_count += 1 logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, deleted past non-recurring bank holiday via REST API: key={holiday_key}, period_name="{period_name}", end_date={end_date_str}' ) else: past_periods_deletion_failures_count += 1 logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to delete past non-recurring bank holiday via REST API: key={holiday_key}, period_name="{period_name}", status_code={response.status_code}, response={response.text}' ) except Exception as e: past_periods_deletion_failures_count += 1 logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to delete past non-recurring bank holiday: key={holiday_key}, period_name="{period_name}", exception="{str(e)}"' ) # For recurring holidays, keep the oldest one (template) for each pattern, delete others for pattern_key, holidays_list in recurring_by_pattern.items(): if len(holidays_list) > 1: # Sort by 
time_created (oldest first) - keep the first one as template holidays_list.sort(key=lambda x: x.get("time_created", 0)) template = holidays_list[0] duplicates = holidays_list[1:] logging.debug( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, pattern="{pattern_key}" has {len(holidays_list)} past occurrences, keeping template (key={template["key"]}), deleting {len(duplicates)} duplicates.' ) # Delete duplicate past periods via REST API for duplicate in duplicates: duplicate_key = duplicate["key"] duplicate_holiday = duplicate["holiday"] period_name = duplicate_holiday.get("period_name", "unknown") try: end_date_epoch = duplicate_holiday.get("end_date") end_dt = datetime.datetime.fromtimestamp(end_date_epoch, tz=datetime.timezone.utc) end_date_str = end_dt.strftime("%Y-%m-%d %H:%M:%S") # Use REST API endpoint for deletion (enables auditing) target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/bank_holidays/admin/delete" payload = {"_key": duplicate_key} response = requests.post( target_url, headers=header, data=json.dumps(payload), verify=False, timeout=600 ) if response.status_code == 200: past_periods_deleted_count += 1 logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, deleted duplicate past recurring bank holiday via REST API: key={duplicate_key}, period_name="{period_name}", pattern="{pattern_key}", end_date={end_date_str}' ) else: past_periods_deletion_failures_count += 1 logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to delete duplicate past recurring bank holiday via REST API: key={duplicate_key}, period_name="{period_name}", status_code={response.status_code}, response={response.text}' ) except Exception as e: past_periods_deletion_failures_count += 1 logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to delete duplicate past recurring 
bank holiday: key={duplicate_key}, period_name="{period_name}", exception="{str(e)}"' ) # If only one past occurrence, keep it as template (no deletion needed) logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, cleanup completed: {past_periods_deleted_count} past periods deleted, {past_periods_deletion_failures_count} deletion failures.' ) except Exception as e: logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed during cleanup of past bank holidays, exception="{str(e)}"' ) ############################################################ # Step 2: Process recurring holidays and create future occurrences # Ensure we have periods for current year + next year (year+1) ############################################################ # Get all recurring bank holidays (after cleanup) query_recurring = json.dumps({"is_recurring": True}) recurring_holidays = collection.data.query(query=query_recurring) if not recurring_holidays: logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, no recurring bank holidays found, skipping creation task.' ) else: logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, found {len(recurring_holidays)} recurring bank holiday(s) to process.' 
) # Plan ahead for current year + next year (year+1) # This ensures we always have periods for the current year (if not yet passed) and next year years_to_check = [current_year, current_year + 1] # Get all existing bank holidays to check for duplicates all_existing_holidays = collection.data.query() existing_periods_by_pattern = {} # Group existing holidays by pattern (period_name + country_code + month/day) for holiday in all_existing_holidays: holiday_dict = dict(holiday) period_name = holiday_dict.get("period_name", "") country_code = holiday_dict.get("country_code", "") start_date_epoch = holiday_dict.get("start_date") end_date_epoch = holiday_dict.get("end_date") if start_date_epoch and end_date_epoch: start_dt = datetime.datetime.fromtimestamp(start_date_epoch, tz=datetime.timezone.utc) end_dt = datetime.datetime.fromtimestamp(end_date_epoch, tz=datetime.timezone.utc) # Create a pattern key: period_name + country_code + month/day of start and end pattern_key = f"{period_name}|{country_code}|{start_dt.month:02d}-{start_dt.day:02d}|{end_dt.month:02d}-{end_dt.day:02d}" if pattern_key not in existing_periods_by_pattern: existing_periods_by_pattern[pattern_key] = [] existing_periods_by_pattern[pattern_key].append({ "year": start_dt.year, "record": holiday_dict }) # Process each recurring holiday for recurring_holiday in recurring_holidays: recurring_holidays_processed_count += 1 holiday_dict = dict(recurring_holiday) period_name = holiday_dict.get("period_name", "") country_code = holiday_dict.get("country_code", "") comment = holiday_dict.get("comment", "") start_date_epoch = holiday_dict.get("start_date") end_date_epoch = holiday_dict.get("end_date") src_user = holiday_dict.get("src_user", "system") if not start_date_epoch or not end_date_epoch: logging.warning( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, recurring holiday with key={holiday_dict.get("_key")} has invalid dates, skipping.' 
) continue # Parse original dates try: start_dt = datetime.datetime.fromtimestamp(start_date_epoch, tz=datetime.timezone.utc) end_dt = datetime.datetime.fromtimestamp(end_date_epoch, tz=datetime.timezone.utc) except Exception as e: logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to parse dates for recurring holiday key={holiday_dict.get("_key")}, exception="{str(e)}"' ) continue # Extract month/day from original dates start_month = start_dt.month start_day = start_dt.day start_hour = start_dt.hour start_minute = start_dt.minute end_month = end_dt.month end_day = end_dt.day end_hour = end_dt.hour end_minute = end_dt.minute # Create pattern key for this recurring holiday pattern_key = f"{period_name}|{country_code}|{start_month:02d}-{start_day:02d}|{end_month:02d}-{end_day:02d}" # Check which years already have this holiday # We check both by year and by actual date range to be more robust existing_years = set() existing_date_ranges = {} # year -> list of (start_epoch, end_epoch) tuples if pattern_key in existing_periods_by_pattern: for existing_period in existing_periods_by_pattern[pattern_key]: year = existing_period["year"] existing_years.add(year) record = existing_period["record"] if year not in existing_date_ranges: existing_date_ranges[year] = [] existing_date_ranges[year].append(( record.get("start_date"), record.get("end_date") )) logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, processing recurring holiday: period_name="{period_name}", pattern_key="{pattern_key}", existing_years={sorted(existing_years)}' ) # Create periods for missing years for target_year in years_to_check: # Check if year already exists if target_year in existing_years: # Double-check by verifying the date range matches # This handles edge cases where same year might have been created manually should_skip = False if target_year in existing_date_ranges: # Calculate what the date 
range should be for this year if end_month < start_month or (end_month == start_month and end_day < start_day): # Year-spanning: target_year to target_year+1 expected_start = self.safe_create_datetime( target_year, start_month, start_day, start_hour, start_minute, tzinfo=datetime.timezone.utc ).timestamp() expected_end = self.safe_create_datetime( target_year + 1, end_month, end_day, end_hour, end_minute, tzinfo=datetime.timezone.utc ).timestamp() else: # Normal: both in target_year expected_start = self.safe_create_datetime( target_year, start_month, start_day, start_hour, start_minute, tzinfo=datetime.timezone.utc ).timestamp() expected_end = self.safe_create_datetime( target_year, end_month, end_day, end_hour, end_minute, tzinfo=datetime.timezone.utc ).timestamp() # Check if any existing period matches this date range (within same day tolerance) for existing_start, existing_end in existing_date_ranges[target_year]: if existing_start and existing_end: # Check if dates are on the same day (tolerance for time differences) existing_start_dt = datetime.datetime.fromtimestamp(existing_start, tz=datetime.timezone.utc) expected_start_dt = datetime.datetime.fromtimestamp(expected_start, tz=datetime.timezone.utc) if (existing_start_dt.year == expected_start_dt.year and existing_start_dt.month == expected_start_dt.month and existing_start_dt.day == expected_start_dt.day): should_skip = True break if should_skip: logging.debug( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, period already exists for year={target_year}, period_name="{period_name}", skipping.' 
) continue # Calculate start and end dates for target year try: # Handle year-spanning holidays (e.g., Dec 31 - Jan 1) if end_month < start_month or (end_month == start_month and end_day < start_day): # Holiday spans across years (e.g., Dec 31 - Jan 1) # Start date is in target_year, end date is in target_year + 1 new_start_dt = self.safe_create_datetime( target_year, start_month, start_day, start_hour, start_minute, tzinfo=datetime.timezone.utc ) new_end_dt = self.safe_create_datetime( target_year + 1, end_month, end_day, end_hour, end_minute, tzinfo=datetime.timezone.utc ) else: # Normal holiday within the same year new_start_dt = self.safe_create_datetime( target_year, start_month, start_day, start_hour, start_minute, tzinfo=datetime.timezone.utc ) new_end_dt = self.safe_create_datetime( target_year, end_month, end_day, end_hour, end_minute, tzinfo=datetime.timezone.utc ) # Convert to epoch timestamps new_start_epoch = int(round(new_start_dt.timestamp())) new_end_epoch = int(round(new_end_dt.timestamp())) # Validate date range if new_end_epoch <= new_start_epoch: logging.warning( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, invalid date range for year={target_year}, period_name="{period_name}", skipping.' 
) continue # Create new record via REST API (enables auditing and delegates complexity) try: target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/bank_holidays/admin/create" payload = { "period_name": period_name, "start_date": new_start_epoch, "end_date": new_end_epoch, "comment": comment, "country_code": country_code, "is_recurring": True, # Keep recurring flag } response = requests.post( target_url, headers=header, data=json.dumps(payload), verify=False, timeout=600 ) if response.status_code == 200: response_data = response.json() created_record = response_data.get("payload", {}) new_key = created_record.get("_key") new_periods_created_count += 1 logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, successfully created recurring bank holiday period via REST API: key={new_key}, period_name="{period_name}", year={target_year}, start_date={new_start_dt.strftime("%Y-%m-%d %H:%M")}, end_date={new_end_dt.strftime("%Y-%m-%d %H:%M")}' ) elif response.status_code == 409: # 409 Conflict means the period already exists (duplicate detection) # This is expected behavior, not an error - log at debug level logging.debug( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, recurring bank holiday period already exists (duplicate detected): period_name="{period_name}", year={target_year}, status_code=409' ) else: periods_creation_failures_count += 1 logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to create recurring bank holiday period via REST API for year={target_year}, period_name="{period_name}", status_code={response.status_code}, response={response.text}' ) except Exception as e: periods_creation_failures_count += 1 logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to create recurring bank holiday period for year={target_year}, period_name="{period_name}", 
exception="{str(e)}"' ) except Exception as e: periods_creation_failures_count += 1 logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to calculate dates for year={target_year}, period_name="{period_name}", exception="{str(e)}"' ) except Exception as e: logging.error( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to process recurring bank holidays, exception="{str(e)}"' ) # add to global_results_dict global_results_dict[f"{task_name}"] = { "recurring_holidays_processed_count": recurring_holidays_processed_count, "new_periods_created_count": new_periods_created_count, "periods_creation_failures_count": periods_creation_failures_count, "past_periods_deleted_count": past_periods_deleted_count, "past_periods_deletion_failures_count": past_periods_deletion_failures_count, "result": f"{recurring_holidays_processed_count} recurring holidays processed, {new_periods_created_count} new periods created, {periods_creation_failures_count} creation failures, {past_periods_deleted_count} past periods deleted, {past_periods_deletion_failures_count} deletion failures", } logging.info( f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.' ) # # End # # yield the results yield_record = { "_time": time.time(), "_raw": global_results_dict, "results": global_results_dict, } yield yield_record # # END # # end general task logging.info( f"instance_id={instance_id}, trackmegeneralhealthmanager has terminated, total_run_time={round(time.time() - global_start, 3)}" ) dispatch(HealthTracker, sys.argv, sys.stdin, sys.stdout, __name__)