#!/usr/bin/env python # coding=utf-8 __author__ = "TrackMe Limited" __copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K." __credits__ = "TrackMe Limited, U.K." __license__ = "TrackMe Limited, all rights reserved" __version__ = "0.1.0" __maintainer__ = "TrackMe Limited, U.K." __email__ = "support@trackme-solutions.com" __status__ = "PRODUCTION" # Standard library imports import os import sys import time import json import uuid import threading import hashlib from logging.handlers import RotatingFileHandler # Logging imports import logging from logging.handlers import RotatingFileHandler # Networking imports import requests from requests.structures import CaseInsensitiveDict import urllib3 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) # splunk home splunkhome = os.environ["SPLUNK_HOME"] # set logging filehandler = RotatingFileHandler( f"{splunkhome}/var/log/splunk/trackme_tracker_health.log", mode="a", maxBytes=10000000, backupCount=1, ) formatter = logging.Formatter( "%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s" ) logging.Formatter.converter = time.gmtime filehandler.setFormatter(formatter) log = logging.getLogger() # root logger - Good to get it only once. for hdlr in log.handlers[:]: # remove the existing file handlers if isinstance(hdlr, logging.FileHandler): log.removeHandler(hdlr) log.addHandler(filehandler) # set the new handler # set the log level to INFO, DEBUG as the default is ERROR log.setLevel(logging.INFO) # append current directory sys.path.append(os.path.dirname(os.path.abspath(__file__))) # import libs import import_declare_test # import Splunk libs from splunklib.searchcommands import ( dispatch, GeneratingCommand, Configuration, Option, validators, ) import splunklib.results as results # import trackme libs from trackme_libs import ( trackme_reqinfo, trackme_register_tenant_object_summary, trackme_delete_tenant_object_summary, trackme_vtenant_account, trackme_idx_for_tenant, trackme_state_event, trackme_register_tenant_component_summary, trackme_handler_events, trackme_manage_report_schedule, trackme_get_version, ) # import trackme licensing libs from trackme_libs_licensing import trackme_check_license # import trackme libs from trackme_libs import ( trackme_report_update_enablement, run_splunk_search, trackme_gen_state, ) # import trackme libs utils from trackme_libs_utils import remove_leading_spaces, decode_unicode # import trackme libs logical groups from trackme_libs_logicalgroup import ( get_logical_groups_collection_records, logical_group_remove_object_from_groups, logical_group_delete_group_by_name, ) # import TrackMe get data libs from trackme_libs_get_data import ( get_full_kv_collection, ) # import default vtenant account settings from collections_data import vtenant_account_default # import trackme libs sla from trackme_libs_sla import trackme_sla_gen_metrics # import trackme libs schema from trackme_libs_schema import trackme_schema_format_version @Configuration(distributed=False) class HealthTracker(GeneratingCommand): tenant_id = Option( doc=""" **Syntax:** **tenant_id=**** **Description:** The tenant identifier.""", require=True, default=None, ) get_acl = Option( doc=""" **Syntax:** **get_acl=**** **Description:** Retrieve ACLs information for the tenant knowledge objects, disabled by default as this can generate more rest traffic and load.""", require=False, default=False, validate=validators.Boolean(), ) """ Function to return a unique uuid which is used to trace performance run_time of each subtask. """ def get_uuid(self): return str(uuid.uuid4()) def register_component_summary_async( self, session_key, splunkd_uri, tenant_id, component ): try: summary_register_response = trackme_register_tenant_component_summary( session_key, splunkd_uri, tenant_id, component, ) logging.debug( f'function="trackme_register_tenant_component_summary", response="{json.dumps(summary_register_response, indent=2)}"' ) except Exception as e: logging.error( f'failed to register the component summary with exception="{str(e)}"' ) def generate(self, **kwargs): # performance counter start = time.time() # set instance_id instance_id = self.get_uuid() # Get request info and set logging level reqinfo = trackme_reqinfo( self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri, ) log.setLevel(reqinfo["logging_level"]) # Build header and target URL headers = CaseInsensitiveDict() headers["Authorization"] = f"Splunk {self._metadata.searchinfo.session_key}" headers["Content-Type"] = "application/json" # Create a requests session for better performance session = requests.Session() session.headers.update(headers) ########################################################################### # Verify the Virtual Tenant account with privileges escalation ########################################################################### task_start = time.time() task_instance_id = self.get_uuid() task_name = "check_vtenant_accounts" # start task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, verifying the vtenant account' ) try: vtenant_account = trackme_vtenant_account( self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri, self.tenant_id, ) except Exception as e: # target url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/configuration/admin/maintain_vtenant_account" # proceed try: response = session.post( url, data=json.dumps( {"tenant_id": self.tenant_id, "force_create_missing": True} ), verify=False, timeout=600, ) if response.status_code not in (200, 201, 204): logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, verify vtenant account has failed, was this account deleted by mistake? response.status_code="{response.status_code}", response.text="{response.text}"' ) raise Exception(f'verify vtenant account has failed, was this account deleted by mistake? response.status_code="{response.status_code}", response.text="{response.text}"') else: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, account was verified successfully' ) response_json = response.json() # fetch the vtenant account again vtenant_account = trackme_vtenant_account( self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri, self.tenant_id ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, verify vtenant account has failed, exception="{str(e)}"' ) raise Exception(f'verify vtenant account has failed, exception="{str(e)}"') # end task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.' ) # # # # get the target index tenant_indexes = trackme_idx_for_tenant( self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri, self.tenant_id, ) # get global indexes global_indexes = { "trackme_summary_idx": reqinfo["trackme_conf"]["index_settings"][ "trackme_summary_idx" ], "trackme_audit_idx": reqinfo["trackme_conf"]["index_settings"][ "trackme_audit_idx" ], "trackme_metric_idx": reqinfo["trackme_conf"]["index_settings"][ "trackme_metric_idx" ], "trackme_notable_idx": reqinfo["trackme_conf"]["index_settings"][ "trackme_notable_idx" ], } logging.debug( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, global_indexes="{json.dumps(global_indexes, indent=2)}"' ) # get trackme release trackme_version = trackme_get_version( self.service, log_context={ "context_prefix": f'tenant_id="{self.tenant_id}", instance_id={instance_id}' } ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, running trackme version="{trackme_version}"' ) # set the schema_version_required schema_version_required = trackme_schema_format_version(trackme_version) # Get the session key session_key = self._metadata.searchinfo.session_key # Add the session_key to the reqinfo reqinfo["session_key"] = session_key # report name for logging purposes report_name = f"trackme_health_tracker_tenant_{self.tenant_id}" # Data collection collection_name = "kv_trackme_virtual_tenants" collection = self.service.kvstore[collection_name] # Get the tenant KVrecord query_string = { "tenant_id": self.tenant_id, } vtenant_record = collection.data.query(query=json.dumps(query_string))[0] # # check license state # task_start = time.time() task_instance_id = self.get_uuid() task_name = "check_licensing" logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) try: check_license = trackme_check_license( reqinfo["server_rest_uri"], session_key ) license_is_valid = check_license.get("license_is_valid") license_subscription_class = check_license.get("license_subscription_class") license_active_tenants = check_license.get("license_active_tenants") license_active_tenants_list = check_license.get( "license_active_tenants_list" ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, function check_license called, task_instance_id={task_instance_id}, license_is_valid="{license_is_valid}", license_subscription_class="{license_subscription_class}", license_active_tenants="{license_active_tenants}", license_active_tenants_list="{license_active_tenants_list}"' ) except Exception as e: license_is_valid = 2 license_subscription_class = "unlimited" logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, function check_license has failed, exception="{str(e)}"' ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.' ) # # check tenants indexes settings: # - retrieve the configured indexes for the tenant # - retrieve via a REST call to splunkd the list of declared indexes on the search head # - if any of the tenant defines indexes are not declared on the search head, update the tenant indexes settings to fallback to TrackMe default indexes and log the issue # task_start = time.time() task_instance_id = self.get_uuid() task_name = "check_tenants_indexes_settings" # start task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) def get_indexes_by_datatype(datatype=None): """Retrieve indexes from the search head by datatype. Args: datatype (str, optional): The datatype to filter by (e.g. 'metric'). If None, retrieves all indexes. Returns: dict: Dictionary of index names and their datatypes """ url = f"{reqinfo['server_rest_uri']}/services/data/indexes?output_mode=json&count=0" if datatype: url += f"&datatype={datatype}" try: response = requests.get(url, headers=headers, verify=False, timeout=600) if response.status_code == 200: indexes_raw = response.json().get("entry", []) for index in indexes_raw: if isinstance(index, dict): index_name = index.get("name") if index_name: declared_indexes_dict[index_name] = { "datatype": index.get("content", {}).get( "datatype", "" ) } logging.debug( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, declared_indexes="{json.dumps(declared_indexes_dict, indent=2)}"' ) else: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to retrieve indexes list, status code: {response.status_code}' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, could not retrieve the list of declared indexes on the search head, exception="{str(e)}"' ) def get_fallback_indexes(index_category=None): """Retrieve fallback indexes from the search head. Returns: dict: Dictionary of fallback indexes """ fallback_indexes = { "trackme_summary_idx": "trackme_summary", "trackme_audit_idx": "trackme_audit", "trackme_metric_idx": "trackme_metrics", "trackme_notable_idx": "trackme_notable", } if index_category: return fallback_indexes.get(index_category, None) else: return fallback_indexes # get the tenant indexes settings tenant_indexes_settings = trackme_idx_for_tenant( session_key, reqinfo["server_rest_uri"], self.tenant_id, ) logging.debug( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_indexes_settings="{json.dumps(tenant_indexes_settings, indent=2)}"' ) """ Example of tenant_indexes_settings: { "trackme_summary_idx": "trackme_summary", "trackme_audit_idx": "trackme_audit", "trackme_metric_idx": "trackme_metrics", "trackme_notable_idx": "trackme_notable" } """ # check if tenant_indexes_settings is set to global tenant_indexes_uses_global_indexes = False if tenant_indexes_settings == "global": tenant_indexes_settings = global_indexes tenant_indexes_uses_global_indexes = True logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_indexes_settings set to global, will check the search head for declared indexes.' ) # process declared_indexes_dict = {} # Get all indexes (events) get_indexes_by_datatype() # Get metrics indexes get_indexes_by_datatype(datatype="metric") # only proceed if we have declared indexes if not declared_indexes_dict: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, no declared indexes found, skipping tenant indexes settings check.' ) return # for each index in the tenant indexes settings, check if it is declared on the search head # we also want to check for trackme_metrics_idx that the datatype is set to "metric" # if not, we will force update the tenant indexes settings to fallback to TrackMe default indexes invalid_indexes_settings_detected = False # process the tenant indexes settings for index_category, index_value in tenant_indexes_settings.items(): if not isinstance(index_value, str): logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, index_category="{index_category}" has invalid index value type: {type(index_value)}' ) invalid_indexes_settings_detected = True # update the tenant indexes settings for the current index_category tenant_indexes_settings[index_category] = get_fallback_indexes( index_category ) continue if index_value not in declared_indexes_dict: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, index_category="{index_category}", index_value="{index_value}" is not declared on the search head, this is an invalid configuration, we will force update the tenant indexes settings to fallback to TrackMe default indexes. Please ensure to define indexes in the search head tier before attempting to configure your tenant indexes settings.' ) invalid_indexes_settings_detected = True # update the tenant indexes settings for the current index_category tenant_indexes_settings[index_category] = get_fallback_indexes( index_category ) continue elif index_category == "trackme_metrics_idx": index_info = declared_indexes_dict.get(index_value, {}) if index_info.get("datatype") != "metric": logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, index_category="{index_category}", index_value="{index_value}" is not configured as a metric index, this is an invalid configuration, we will force update the tenant indexes settings to fallback to TrackMe default indexes.' ) invalid_indexes_settings_detected = True # update the tenant indexes settings for the current index_category tenant_indexes_settings[index_category] = get_fallback_indexes( index_category ) continue if not invalid_indexes_settings_detected: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, no invalid indexes settings detected, nothing to do.' ) else: # If we were using global indexes and found issues, we need to fallback to default indexes if tenant_indexes_uses_global_indexes: logging.warning( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, issues detected with global indexes, falling back to default indexes.' ) tenant_indexes_settings = get_fallback_indexes() # fix the tenant indexes settings vtenant_record["tenant_idx_settings"] = json.dumps( tenant_indexes_settings, indent=2 ) try: self.service.kvstore["kv_trackme_virtual_tenants"].data.update( str(vtenant_record["_key"]), json.dumps(vtenant_record) ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, vtenant_record updated successfully, new tenant_idx_settings="{json.dumps(tenant_indexes_settings, indent=2)}"' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, Failed to update vtenant_record, exception: {str(e)}' ) # end task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.' ) ################################################################################## # Global system verifications: verify that the relevant scheduled jobs are enabled ################################################################################## # These jobs are not tenant specifics, however we use the health tracker to ensure that # these are effectively enabled when at least one tenant has been created and is active task_start = time.time() task_instance_id = self.get_uuid() task_name = "check_global_trackers_enablement" # start task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) savedsearch_names = [ "trackme_ack_expiration_tracker", "trackme_maintenance_mode_tracker", "trackme_backup_scheduler", "trackme_general_health_manager", ] for savedsearch_name in savedsearch_names: # check ack expiration tracker update_properties_required = False try: mysavedsearch = self.service.saved_searches[savedsearch_name] current_disabled = int(mysavedsearch["disabled"]) logging.debug( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, global config check, verifying savedsearch="{mysavedsearch.name}", disabled="{current_disabled}"' ) if current_disabled == 1: update_properties_required = True except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, global configuration verification, could not retrieve the status for {savedsearch_name}' ) if update_properties_required: try: action = trackme_report_update_enablement( session_key, self._metadata.searchinfo.splunkd_uri, self.tenant_id, savedsearch_name, "enable", ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, global config check, enabling savedsearch="{savedsearch_name}", result="{action}"' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, global config check, an exception was encountered while trying to enable savedsearch="{savedsearch_name}", exception="{str(e)}"' ) # end task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.' ) ################################################################################## # Optimize: enable or disable the schedule for utilities depending on the tenant # settings, and conditions ################################################################################## task_start = time.time() task_instance_id = self.get_uuid() task_name = "optimize_tenant_scheduled_reports" # start task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) # Define the valid components valid_components = {"dsm", "dhm", "mhm", "flx", "wlk", "fqm"} def manage_savedsearch_schedule( savedsearch_names, feature_enabled, feature_name ): """ Helper function to manage saved search scheduling based on feature enablement. Args: savedsearch_names: List of saved search names to manage feature_enabled: Boolean indicating if the feature should be enabled feature_name: String name of the feature for logging purposes """ for savedsearch_name in savedsearch_names: # get the status of the savedsearch savedsearch_properties, savedsearch_acl = ( trackme_manage_report_schedule( logging, session_key, self._metadata.searchinfo.splunkd_uri, self.tenant_id, savedsearch_name, action="status", ) ) # log logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", savedsearch_properties="{json.dumps(savedsearch_properties, indent=2)}", savedsearch_acl="{json.dumps(savedsearch_acl, indent=2)}"' ) # get the is_scheduled status is_scheduled = int(savedsearch_properties.get("is_scheduled", 0)) # avoid failing to schedule the savedsearch if any of the following is missing or equal to None: # dispatch.earliest_time # dispatch.latest_time # cron_schedule # schedule_window # outliers_mltrain: # "cron_schedule": "*/60 * * * *", # "dispatch.earliest_time": "-5m", # "dispatch.latest_time": "now", # "schedule_window": "5", # outliers_mlmonitor: # "cron_schedule": "*/20 * * * *", # "dispatch.earliest_time": "-5m", # "dispatch.latest_time": "now", # "schedule_window": "5", # data_sampling: # "cron_schedule": "*/20 * * * *", # "dispatch.earliest_time": "-24h", # "dispatch.latest_time": "-4h", # "schedule_window": "5", # adaptive_delay: # "cron_schedule": "*/20 * * * *", # "dispatch.earliest_time": "-5m", # "dispatch.latest_time": "now", # "schedule_window": "5", # delayed_inspector: # "cron_schedule": "*/20 * * * *", # "dispatch.earliest_time": "-5m", # "dispatch.latest_time": "now", # "schedule_window": "5", # if any of these parameters is missing in the savedsearch properties, we need to add them if "dispatch.earliest_time" not in savedsearch_properties or savedsearch_properties.get("dispatch.earliest_time") in (None, 'None', ''): if "outliers_mltrain" in savedsearch_name: savedsearch_properties["dispatch.earliest_time"] = "-5m" elif "outliers_mlmonitor" in savedsearch_name: savedsearch_properties["dispatch.earliest_time"] = "-5m" elif "data_sampling" in savedsearch_name: savedsearch_properties["dispatch.earliest_time"] = "-24h" elif "adaptive_delay" in savedsearch_name: savedsearch_properties["dispatch.earliest_time"] = "-5m" elif "delayed_entities_inspector" in savedsearch_name: savedsearch_properties["dispatch.earliest_time"] = "-5m" else: savedsearch_properties["dispatch.earliest_time"] = "-5m" if "dispatch.latest_time" not in savedsearch_properties or savedsearch_properties.get("dispatch.latest_time") in (None, 'None', ''): if "outliers_mltrain" in savedsearch_name: savedsearch_properties["dispatch.latest_time"] = "now" elif "outliers_mlmonitor" in savedsearch_name: savedsearch_properties["dispatch.latest_time"] = "now" elif "data_sampling" in savedsearch_name: savedsearch_properties["dispatch.latest_time"] = "-4h" elif "adaptive_delay" in savedsearch_name: savedsearch_properties["dispatch.latest_time"] = "now" elif "delayed_entities_inspector" in savedsearch_name: savedsearch_properties["dispatch.latest_time"] = "now" else: savedsearch_properties["dispatch.latest_time"] = "now" if "cron_schedule" not in savedsearch_properties or savedsearch_properties.get("cron_schedule") in (None, 'None', ''): if "outliers_mltrain" in savedsearch_name: savedsearch_properties["cron_schedule"] = "0 22-23,0-6 * * *" elif "outliers_mlmonitor" in savedsearch_name: savedsearch_properties["cron_schedule"] = "*/20 * * * *" elif "data_sampling" in savedsearch_name: savedsearch_properties["cron_schedule"] = "*/20 22-23,0-6 * * *" elif "adaptive_delay" in savedsearch_name: savedsearch_properties["cron_schedule"] = "*/20 * * * *" elif "delayed_entities_inspector" in savedsearch_name: savedsearch_properties["cron_schedule"] = "*/20 * * * *" else: savedsearch_properties["cron_schedule"] = "*/5 * * * *" if "schedule_window" not in savedsearch_properties or savedsearch_properties.get("schedule_window") in (None, 'None', ''): savedsearch_properties["schedule_window"] = "5" # act if is_scheduled == 1 and feature_enabled == False: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", disabling savedsearch.' ) try: savedsearch_properties, savedsearch_acl = ( trackme_manage_report_schedule( logging, session_key, self._metadata.searchinfo.splunkd_uri, self.tenant_id, savedsearch_name, input_report_properties=savedsearch_properties, action="disable", ) ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", savedsearch updated successfully, properties="{json.dumps(savedsearch_properties, indent=2)}"' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", an exception was encountered while trying to update savedsearch, exception="{str(e)}"' ) elif is_scheduled == 0 and feature_enabled == True: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", enabling savedsearch.' ) try: savedsearch_properties, savedsearch_acl = ( trackme_manage_report_schedule( logging, session_key, self._metadata.searchinfo.splunkd_uri, self.tenant_id, savedsearch_name, input_report_properties=savedsearch_properties, action="enable", ) ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", savedsearch updated successfully, properties="{json.dumps(savedsearch_properties, indent=2)}"' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", an exception was encountered while trying to update savedsearch, exception="{str(e)}"' ) else: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", nothing to do.' ) # Process except for replica tenants try: tenant_replica = int(vtenant_record.get("tenant_replica", 0)) except Exception as e: tenant_replica = 0 if tenant_replica == 1: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, detected replica tenant by name pattern, setting tenant_replica=1' ) # Log replica tenant status for debugging logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_replica="{tenant_replica}", will_process="{tenant_replica == 0}"' ) if tenant_replica == 0: # only process non-replica tenants (value: 0) for valid_component in valid_components: valid_component_is_enabled = int( vtenant_record.get(f"tenant_{valid_component}_enabled", 0) ) if valid_component_is_enabled == 1: # only for dsm/dhm/flx/wlk if valid_component in ("dsm", "dhm", "flx", "wlk", "fqm"): # # ML Outliers # try: savedsearch_names = [ f"trackme_{valid_component}_outliers_mltrain_tracker_tenant_{self.tenant_id}", f"trackme_{valid_component}_outliers_mlmonitor_tracker_tenant_{self.tenant_id}", ] # Default to True feature_enabled = True # Construct the key dynamically key = f"mloutliers_{valid_component}" # Check if the component is valid and handle exceptions if valid_component in valid_components: try: feature_enablement = int(vtenant_account.get(key, 1)) if feature_enablement == 0: feature_enabled = False except (ValueError, TypeError): feature_enabled = True else: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}" is not valid, valid components are {valid_components}' ) manage_savedsearch_schedule( savedsearch_names, feature_enabled, "outliers" ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"' ) # # Sampling (dsm only) # try: if valid_component == "dsm": savedsearch_names = [ f"trackme_dsm_data_sampling_tracker_tenant_{self.tenant_id}", ] # Default to True feature_enabled = True # Construct the key dynamically key = f"sampling" # Check if the component is valid and handle exceptions if valid_component in valid_components: try: feature_enablement = int(vtenant_account.get(key, 1)) if feature_enablement == 0: feature_enabled = False except (ValueError, TypeError): feature_enabled = True else: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}" is not valid, valid components are {valid_components}' ) manage_savedsearch_schedule( savedsearch_names, feature_enabled, "sampling" ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"' ) # # Adaptive delay (dsm only) # try: if valid_component == "dsm": savedsearch_names = [ f"trackme_dsm_adaptive_delay_tracker_tenant_{self.tenant_id}", ] # Default to True feature_enabled = True # Construct the key dynamically key = f"adaptive_delay" # Check if the component is valid and handle exceptions if valid_component in valid_components: try: feature_enablement = int(vtenant_account.get(key, 1)) if feature_enablement == 0: feature_enabled = False except (ValueError, TypeError): feature_enabled = True else: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}" is not valid, valid components are {valid_components}' ) manage_savedsearch_schedule( savedsearch_names, feature_enabled, "adaptive_delay" ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"' ) # # Delayed inspector (dsm/dhm only) # try: if valid_component in ("dsm", "dhm"): savedsearch_names = [ f"trackme_{valid_component}_delayed_entities_inspector_tracker_tenant_{self.tenant_id}", ] # Default to True feature_enabled = True # Construct the key dynamically keys = [ "splk_feeds_delayed_inspector_24hours_range_min_sec", "splk_feeds_delayed_inspector_7days_range_min_sec", "splk_feeds_delayed_inspector_until_disabled_range_min_sec", ] # Check if the component is valid and handle exceptions (all keys must be set to 0 for the feature to be disabled) if valid_component in valid_components: try: feature_enabled = True # Default to enabled for key in keys: feature_enablement = int( vtenant_account.get(key, 1) ) if feature_enablement != 0: # If any key is not 0, the feature should be enabled feature_enabled = True break else: # If we get here, all keys were 0, so disable the feature feature_enabled = False except (ValueError, TypeError): feature_enabled = True else: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}" is not valid, valid components are {valid_components}' ) manage_savedsearch_schedule( savedsearch_names, feature_enabled, "delayed_inspector" ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"' ) # # Priority policies: depends on if we have content in the KVstore collection # try: savedsearch_names = [ f"trackme_{valid_component}_priority_tracker_tenant_{self.tenant_id}", ] priority_collection_name = f"kv_trackme_{valid_component}_priority_policies_tenant_{self.tenant_id}" priority_collection = self.service.kvstore[priority_collection_name] ( priority_records, priority_collection_keys, priority_collection_dict, ) = get_full_kv_collection( priority_collection, priority_collection_name ) # check if we have content in the collection logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", priority_collection_name="{priority_collection_name}", priority_records_count="{len(priority_records)}"' ) feature_enabled = bool(priority_records) manage_savedsearch_schedule( savedsearch_names, feature_enabled, "priority_policies" ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"' ) # # Tags policies: depends on if we have content in the KVstore collection # try: savedsearch_names = [ f"trackme_{valid_component}_tags_tracker_tenant_{self.tenant_id}", ] tags_collection_name = f"kv_trackme_{valid_component}_tags_policies_tenant_{self.tenant_id}" tags_collection = self.service.kvstore[tags_collection_name] tags_records, tags_collection_keys, tags_collection_dict = ( get_full_kv_collection(tags_collection, tags_collection_name) ) # check if we have content in the collection logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", tags_collection_name="{tags_collection_name}", tags_records_count="{len(tags_records)}"' ) feature_enabled = bool(tags_records) manage_savedsearch_schedule( savedsearch_names, feature_enabled, "tags_policies" ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"' ) # # SLA policies: depends on if we have content in the KVstore collection # try: savedsearch_names = [ f"trackme_{valid_component}_sla_tracker_tenant_{self.tenant_id}", ] sla_collection_name = f"kv_trackme_{valid_component}_sla_policies_tenant_{self.tenant_id}" sla_collection = self.service.kvstore[sla_collection_name] sla_records, sla_collection_keys, sla_collection_dict = ( get_full_kv_collection(sla_collection, sla_collection_name) ) # check if we have content in the collection logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", tags_collection_name="{tags_collection_name}", tags_records_count="{len(sla_records)}"' ) feature_enabled = bool(tags_records) manage_savedsearch_schedule( savedsearch_names, feature_enabled, "sla_policies" ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"' ) # # Shared Elastic Tracker: depends on if we have content in the KVstore collection (dsm only) # try: if valid_component == "dsm": savedsearch_names = [ f"trackme_dsm_shared_elastic_tracker_tenant_{self.tenant_id}", ] shared_elastic_collection_name = ( f"kv_trackme_dsm_elastic_shared_tenant_{self.tenant_id}" ) shared_elastic_collection = self.service.kvstore[ shared_elastic_collection_name ] ( shared_elastic_records, shared_elastic_collection_keys, shared_elastic_collection_dict, ) = get_full_kv_collection( shared_elastic_collection, shared_elastic_collection_name ) # check if we have content in the collection logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", shared_elastic_collection_name="{shared_elastic_collection_name}", shared_elastic_records_count="{len(shared_elastic_records)}"' ) feature_enabled = bool(shared_elastic_records) manage_savedsearch_schedule( savedsearch_names, feature_enabled, "shared_elastic" ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"' ) else: # Skip processing for replica tenants logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, skipping replica tenant processing, tenant_replica="{tenant_replica}"' ) # end task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.' ) ################################################################################## # Replica orchestrator ################################################################################## # This job scheduled will automatically be enabled if we detect that at least one # replica tracker has been created task_start = time.time() task_instance_id = self.get_uuid() task_name = "replica_orchestrator" # start task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) # Try to get the current definition try: tenant_replica_objects = vtenant_record.get("tenant_replica_objects") # logging debug logging.debug( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_replica_objects="{tenant_replica_objects}"' ) except Exception as e: tenant_replica_objects = None # only run if we have a proper replica object if tenant_replica_objects: savedsearch_names = [ "trackme_replica_executor", ] for savedsearch_name in savedsearch_names: # check update_properties_required = False try: mysavedsearch = self.service.saved_searches[savedsearch_name] current_disabled = int(mysavedsearch["disabled"]) logging.debug( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, replica config check, verifying savedsearch="{mysavedsearch.name}", disabled="{current_disabled}"' ) if current_disabled == 1: update_properties_required = True except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, replica configuration verification, could not retrieve the status for {savedsearch_name}' ) if update_properties_required: try: action = trackme_report_update_enablement( session_key, self._metadata.searchinfo.splunkd_uri, self.tenant_id, savedsearch_name, "enable", ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, replica config check, enabling savedsearch="{savedsearch_name}", result="{action}"' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, replica config check, an exception was encountered while trying to enable savedsearch="{savedsearch_name}", exception="{str(e)}"' ) # end task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.' ) ########################################################################### # schema update and migration: detect and migrate Virtual Tenants if needed ########################################################################### task_start = time.time() task_instance_id = self.get_uuid() task_name = "schema_upgrade" # start task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) from trackme_libs_schema import ( trackme_schema_get_version, trackme_schema_update_version, trackme_schema_upgrade_2009, trackme_schema_upgrade_2015, trackme_schema_upgrade_2016, trackme_schema_upgrade_2020, trackme_schema_upgrade_2026, trackme_schema_upgrade_2034, trackme_schema_upgrade_2034_least_privileges, trackme_schema_upgrade_2036, trackme_schema_upgrade_2038, trackme_schema_upgrade_2043, trackme_schema_upgrade_2044, trackme_schema_upgrade_2045, trackme_schema_upgrade_2054, trackme_schema_upgrade_2064, trackme_schema_upgrade_2067, trackme_schema_upgrade_2070, trackme_schema_upgrade_2071, trackme_schema_upgrade_2072, trackme_schema_upgrade_2075, trackme_schema_upgrade_2078, trackme_schema_upgrade_2083, trackme_schema_upgrade_2084, trackme_schema_upgrade_2087, trackme_schema_upgrade_2089, trackme_schema_upgrade_2090, trackme_schema_upgrade_2091, trackme_schema_upgrade_2094, trackme_schema_upgrade_2095, trackme_schema_upgrade_2096, trackme_schema_upgrade_2097, trackme_schema_upgrade_2098, trackme_schema_upgrade_2099, trackme_schema_upgrade_2100, trackme_schema_upgrade_2101, trackme_schema_upgrade_2102, trackme_schema_upgrade_2104, trackme_schema_upgrade_2105, trackme_schema_upgrade_2107, trackme_schema_upgrade_2108, trackme_schema_upgrade_2109, trackme_schema_upgrade_2110, trackme_schema_upgrade_2111, trackme_schema_upgrade_2116, trackme_schema_upgrade_2118, trackme_schema_upgrade_2119, trackme_schema_upgrade_2121, trackme_schema_upgrade_2122, trackme_schema_upgrade_2123, trackme_schema_upgrade_2126, trackme_schema_upgrade_2128, trackme_schema_upgrade_2130, trackme_schema_upgrade_2131, trackme_schema_upgrade_2132, trackme_schema_upgrade_2300, trackme_schema_upgrade_2304, trackme_schema_upgrade_2305, ) # Define a mapping between schema versions and their upgrade functions schema_upgrades = [ (2009, trackme_schema_upgrade_2009), (2015, trackme_schema_upgrade_2015), (2016, trackme_schema_upgrade_2016), (2020, trackme_schema_upgrade_2020), (2026, trackme_schema_upgrade_2026), (2034, trackme_schema_upgrade_2034), (2034, trackme_schema_upgrade_2034_least_privileges), (2036, trackme_schema_upgrade_2036), (2038, trackme_schema_upgrade_2038), (2043, trackme_schema_upgrade_2043), (2043, trackme_schema_upgrade_2044), (2045, trackme_schema_upgrade_2045), (2054, trackme_schema_upgrade_2054), (2064, trackme_schema_upgrade_2064), (2067, trackme_schema_upgrade_2067), (2070, trackme_schema_upgrade_2070), (2071, trackme_schema_upgrade_2071), (2072, trackme_schema_upgrade_2072), (2075, trackme_schema_upgrade_2075), (2078, trackme_schema_upgrade_2078), (2083, trackme_schema_upgrade_2083), (2084, trackme_schema_upgrade_2084), (2087, trackme_schema_upgrade_2087), (2089, trackme_schema_upgrade_2089), (2090, trackme_schema_upgrade_2090), (2091, trackme_schema_upgrade_2091), (2094, trackme_schema_upgrade_2094), (2095, trackme_schema_upgrade_2095), (2096, trackme_schema_upgrade_2096), (2097, trackme_schema_upgrade_2097), (2098, trackme_schema_upgrade_2098), (2099, trackme_schema_upgrade_2099), (2100, trackme_schema_upgrade_2100), (2101, trackme_schema_upgrade_2101), (2102, trackme_schema_upgrade_2102), (2104, trackme_schema_upgrade_2104), (2105, trackme_schema_upgrade_2105), (2107, trackme_schema_upgrade_2107), (2108, trackme_schema_upgrade_2108), (2109, trackme_schema_upgrade_2109), (2110, trackme_schema_upgrade_2110), (2111, trackme_schema_upgrade_2111), (2116, trackme_schema_upgrade_2116), (2118, trackme_schema_upgrade_2118), (2119, trackme_schema_upgrade_2119), (2121, trackme_schema_upgrade_2121), (2122, trackme_schema_upgrade_2122), (2123, trackme_schema_upgrade_2123), (2126, trackme_schema_upgrade_2126), (2128, trackme_schema_upgrade_2128), (2130, trackme_schema_upgrade_2130), (2131, trackme_schema_upgrade_2131), (2132, trackme_schema_upgrade_2132), (2300, trackme_schema_upgrade_2300), (2304, trackme_schema_upgrade_2304), (2305, trackme_schema_upgrade_2305), ] # Get the current schema version try: schema_version = trackme_schema_get_version( reqinfo, self.tenant_id, schema_version_required, task_name, task_instance_id, ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to call function trackme_schema_get_version, exception="{str(e)}"' ) # If schema_version_required is 0 (version retrieval failed), skip upgrade logic # to align with graceful degradation when DB Connect causes permission issues if schema_version_required == 0: logging.warning( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, schema_version_required is 0 (version retrieval failed), skipping schema upgrade logic to prevent data corruption.' ) # Proceed elif not schema_version or int(schema_version) != int(schema_version_required): # # Backup # # Check and act accordingly trackme_backup_attempted = False # Run TrackMe backup: verify if a backup was initiated or performed during the last 24 hours, otherwise initiate a backup trackme_backup_run = True # recent_backup_events_count recent_backup_events_count = 0 # recent_backup_events_raw recent_backup_events_raw = [] # run a Splunk search to identify the last backup initiated time search = remove_leading_spaces( f"""\ search (index=_internal sourcetype=trackme:custom_commands:trackmetrackerhealth task=schema_upgrade "initiating backup now") OR (index=_internal sourcetype=trackme:rest_api trackme.rest.backup_and_restore trackme_rest_handler_backup_and_restore.py post_backup "Backup archive created successfully") | stats count, values(_raw) as last_events """ ) # kwargs kwargs_search = { "earliest_time": "-24h", "latest_time": "now", "preview": "false", "output_mode": "json", "count": 0, } logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, inspecting logs to identify any recent backups.' ) try: reader = run_splunk_search( self.service, search, kwargs_search, 24, 5, ) for item in reader: if isinstance(item, dict): recent_backup_events_count = int(item.get("count", 0)) recent_backup_events_raw = item.get("last_events", []) except Exception as e: msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, recent backup identification search failed with exception="{str(e)}"' logging.error(msg) # if we have detected a recent backup, we will not run a backup if recent_backup_events_count > 0: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, recent backup was detected, no backup will be initiated, recent_backup_events_count="{recent_backup_events_count}", recent_backup_events_raw="{recent_backup_events_raw}"' ) trackme_backup_run = False else: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, no recent backup was detected, initiating backup now, recent_backup_events_count="{recent_backup_events_count}", recent_backup_events_raw="{recent_backup_events_raw}"' ) # before running the first function, execute TrackMe's builtin backup job if trackme_backup_run: if not trackme_backup_attempted: try: response = session.post( f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/backup_and_restore/backup", data=json.dumps( { "comment": f"Backup initiated for schema migration from version {schema_version} to {schema_version_required}" } ), verify=False, timeout=900, ) if response.status_code not in (200, 201, 204): logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, backup post call has failed, response.status_code="{response.status_code}", response.text="{response.text}"' ) else: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, backup post call executed successfully' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, backup post call has failed, exception="{str(e)}"' ) trackme_backup_attempted = True # # schema upgrade # for version, upgrade_func in schema_upgrades: if not schema_version or int(schema_version) < version: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, detected migration required for schema version {version}, schema_version="{schema_version}", schema_version_required="{schema_version_required}", processing now.' ) # proceed try: schema_version_update = upgrade_func( reqinfo, self.tenant_id, int(schema_version), int(schema_version_required), task_name, task_instance_id, ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, schema version {version} migrated successfully.' ) # Update schema version after each successful upgrade try: schema_version_update = trackme_schema_update_version( reqinfo, self.tenant_id, version, # Update to current version being processed task_name, task_instance_id, ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, schema version updated to {version} after successful upgrade.' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to update schema version to {version}, exception="{str(e)}"' ) raise # Re-raise the exception to stop the upgrade process except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to call function {upgrade_func.__name__}, exception="{str(e)}"' ) raise # Re-raise the exception to stop the upgrade process # # finally migrate the schema version to the required version if not already there # try: if int(schema_version) != int(schema_version_required): schema_version_update = trackme_schema_update_version( reqinfo, self.tenant_id, schema_version_required, task_name, task_instance_id, ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, final schema version updated to {schema_version_required}.' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to call function trackme_schema_update_version, exception="{str(e)}"' ) # # check if the vtenant is the last enabled vtenant to be upgraded, if so we will execute the general health tracker # vtenants_records = collection.data.query() vtenants_remaining_count = 0 # iterate through vtenant records, count remaining vtenants to be upgraded for record in vtenants_records: schema_version_raw = record.get("schema_version") # If schema_version is None (e.g., tenant was created when version retrieval failed), # treat it as needing an upgrade if schema_version_raw is None: schema_version_needs_upgrade = True else: schema_version_needs_upgrade = int(schema_version_raw) != int(schema_version_required) if ( schema_version_needs_upgrade and record.get("tenant_status") == "enabled" ): vtenants_remaining_count += 1 if vtenants_remaining_count == 0: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, all vtenants are up to date, executing the general health tracker' ) try: reader = run_splunk_search( self.service, "| savedsearch trackme_general_health_manager", { "earliest_time": "-5m", "latest_time": "now", "preview": "false", "output_mode": "json", "count": 0, }, 24, 5, ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, general health tracker executed successfully' ) except Exception as e: msg = f'permanently failed to execute the general health tracker search, exception="{str(e)}"' logging.error(msg) raise Exception(msg) else: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, schema is up to date, no action required, schema_version="{schema_version}"' ) # end task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.' ) # # # # # all components - inspect_collection # # context: this activty verifies that the collection record object statuses are consistent according to the Decision Maker # It works by loading the component dta, then looping trough objects to verify and update their collection status if needed for component in ("dsm", "dhm", "mhm", "wlk", "flx", "fqm"): if vtenant_record.get(f"tenant_{component}_enabled") == True: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting collection records object statuses now.' ) # set collection target inspect_collection_name = ( f"kv_trackme_{component}_tenant_{self.tenant_id}" ) inspect_collection = self.service.kvstore[inspect_collection_name] # # subtask: permanently_deleted_records_inspection # task_instance_id = self.get_uuid() task_start = time.time() task_name = "inspect_collection:permanently_deleted_records_inspection" # start task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) # # Check permanently deleted records: # A permanently deleted record should not exist in the main KVstore collection, if it does, it should be purged # # Lists to store permanently deleted records found in anomaly collection_permanently_deleted_records_anomaly = [] # search search = remove_leading_spaces( f"""\ | inputlookup trackme_{component}_tenant_{self.tenant_id} | eval keyid=_key | lookup trackme_common_permanently_deleted_objects_tenant_{self.tenant_id} object, object_category OUTPUT _key as permanently_deleted_keys | where isnotnull(permanently_deleted_keys) | table keyid, * """ ) # kwargs kwargs_search = { "earliest_time": "-5m", "latest_time": "now", "preview": "false", "output_mode": "json", "count": 0, } logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting the main data collection for permanently deleted records now.' ) try: reader = run_splunk_search( self.service, search, kwargs_search, 24, 5, ) for item in reader: if isinstance(item, dict): collection_permanently_deleted_records_anomaly.append(item) except Exception as e: msg = f'permanently deleted records inspection search failed with exception="{str(e)}"' logging.error(msg) raise Exception(msg) if len(collection_permanently_deleted_records_anomaly) > 0: logging.warning( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, permanently deleted records found, no_records="{len(collection_permanently_deleted_records_anomaly)}"' ) for record in collection_permanently_deleted_records_anomaly: try: inspect_collection.data.delete( json.dumps({"_key": record.get("keyid")}) ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, entities in the main collection which are also in ther permanently deleted records were purged successfully, keyid="{record.get("keyid")}", record="{json.dumps(record, indent=1)}"' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, failed to delete permanently deleted records in anmaly, keyid="{record.get("keyid")}", , record="{json.dumps(record, indent=1)}", exception="{str(e)}"' ) else: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, no permanenlty deleted records in anomaly found' ) # # Check for any duplicated records in the permanently deleted records collection, based on the object field # permanently_deleted_records_collection_name = f"kv_trackme_common_permanently_deleted_objects_tenant_{self.tenant_id}" permanently_deleted_records_collection = self.service.kvstore[permanently_deleted_records_collection_name] ( permanently_deleted_records, permanently_deleted_collection_keys, permanently_deleted_collection_dict, ) = get_full_kv_collection( permanently_deleted_records_collection, permanently_deleted_records_collection_name ) # Detect duplicated records (same "(object, object_category)") and collect keys to delete (keep first seen) duplicated_pd_keys = [] seen_pairs = set() for pd_key, pd_record in permanently_deleted_collection_dict.items(): object_value = pd_record.get("object") object_category = pd_record.get("object_category") if not object_value or not object_category: continue pair = (object_value, object_category) if pair in seen_pairs: duplicated_pd_keys.append(pd_key) else: seen_pairs.add(pair) if len(duplicated_pd_keys) > 0: logging.warning( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, permanently deleted records collection has duplicates, duplicates_count="{len(duplicated_pd_keys)}"' ) for pd_key in duplicated_pd_keys: try: permanently_deleted_records_collection.data.delete(json.dumps({"_key": pd_key})) # best-effort to fetch object for logging pd_record = permanently_deleted_collection_dict.get(pd_key, {}) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, duplicate in permanently deleted records purged successfully, keyid="{pd_key}", object="{pd_record.get("object")}", object_category="{pd_record.get("object_category")}"' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to purge duplicate in permanently deleted records, keyid="{pd_key}", exception="{str(e)}"' ) else: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, no duplicates found in permanently deleted records collection' ) # end subtask logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.' ) # # subtask: corrupted_records_inspection # task_start = time.time() task_instance_id = self.get_uuid() task_name = "inspect_collection:corrupted_records_inspection" # start task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) # # Check for unexpected corrupted record, a foreign record which has been stored in the KVstore by mistake # would not have an object value, and would be purged if any. # # Lists to store corrupted records collection_corrupted_records = [] # search search = remove_leading_spaces( f"""\ | inputlookup trackme_{component}_tenant_{self.tenant_id} | eval keyid=_key | where isnull(object) OR object="" | table keyid, * """ ) # kwargs kwargs_search = { "earliest_time": "-5m", "latest_time": "now", "preview": "false", "output_mode": "json", "count": 0, } logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting the main data collection for corrupted records now.' ) try: reader = run_splunk_search( self.service, search, kwargs_search, 24, 5, ) for item in reader: if isinstance(item, dict): collection_corrupted_records.append(item) except Exception as e: msg = f'corrupted record inspection search failed with exception="{str(e)}"' logging.error(msg) raise Exception(msg) if len(collection_corrupted_records) > 0: logging.warning( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, corrupted records found, no_records="{len(collection_corrupted_records)}"' ) for corrupted_record in collection_corrupted_records: try: inspect_collection.data.delete( json.dumps({"_key": corrupted_record.get("keyid")}) ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, corrupted record deleted successfully, keyid="{corrupted_record.get("keyid")}", record="{json.dumps(corrupted_record, indent=1)}"' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, failed to delete corrupted record, keyid="{corrupted_record.get("keyid")}", , record="{json.dumps(corrupted_record, indent=1)}", exception="{str(e)}"' ) else: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, no corrupted records found' ) # end subtask logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.' ) # # subtask: missing_tenant_id_records_inspection # task_start = time.time() task_instance_id = self.get_uuid() task_name = "inspect_collection:missing_tenant_id_records_inspection" # start task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) # # Check for records which would miss the tenant_id field, and add it if needed # # collection_misggin_tenant_id_records records collection_missing_tenant_id_records = [] # search search = remove_leading_spaces( f"""\ | inputlookup trackme_{component}_tenant_{self.tenant_id} | eval keyid=_key | where isnull(tenant_id) OR tenant_id="" | table keyid, * """ ) # kwargs kwargs_search = { "earliest_time": "-5m", "latest_time": "now", "preview": "false", "output_mode": "json", "count": 0, } logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting the main data collection for corrupted records now.' ) try: reader = run_splunk_search( self.service, search, kwargs_search, 24, 5, ) for item in reader: if isinstance(item, dict): collection_missing_tenant_id_records.append(item) except Exception as e: msg = f'missing tenant_id record inspection search failed with exception="{str(e)}"' logging.error(msg) raise Exception(msg) if len(collection_missing_tenant_id_records) > 0: logging.warning( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, records found, no_records="{len(collection_missing_tenant_id_records)}"' ) for missing_record in collection_missing_tenant_id_records: try: missing_record["tenant_id"] = self.tenant_id inspect_collection.data.update( missing_record.get("_key"), json.dumps(missing_record), ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, record updated successfully, keyid="{missing_record.get("keyid")}", record="{json.dumps(missing_record, indent=1)}"' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, failed to update record, keyid="{missing_record.get("keyid")}", , record="{json.dumps(missing_record, indent=1)}", exception="{str(e)}"' ) else: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, no records found' ) # end subtask logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.' ) # # subtask: entities_auto_disablement # task_start = time.time() task_instance_id = self.get_uuid() task_name = "inspect_collection:entities_auto_disablement" # start task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) # # Check for feeds entities to be disabled according to the system wide setting: splk_general_feeds_auto_disablement_period # This setting allows to disable feeds entities if they have not been updated for a certain period of time # # system wide setting try: splk_general_feeds_auto_disablement_period = reqinfo["trackme_conf"][ "splk_general" ]["splk_general_feeds_auto_disablement_period"] except Exception as e: logging.warning(f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to get system wide setting, splk_general_feeds_auto_disablement_period, using default value, exception="{str(e)}"') splk_general_feeds_auto_disablement_period = "90d" # tenant setting (override system wide setting, if set) try: splk_feeds_auto_disablement_period = vtenant_account.get( "splk_feeds_auto_disablement_period" ) except Exception as e: logging.warning(f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to get tenant setting, splk_feeds_auto_disablement_period, using system wide setting, exception="{str(e)}"') splk_feeds_auto_disablement_period = splk_general_feeds_auto_disablement_period # handle auto_disablement_period = ( splk_feeds_auto_disablement_period if splk_feeds_auto_disablement_period else splk_general_feeds_auto_disablement_period ) if auto_disablement_period != "0d" and component in ( "dsm", "dhm", "mhm", ): # Lists to store entities to be disabled entities_to_be_disabled = [] # search search = remove_leading_spaces( f"""\ | inputlookup trackme_{component}_tenant_{self.tenant_id} | eval keyid=_key | eval last_time_seen=coalesce(data_last_time_seen, metric_last_time_seen) | where last_time_seen<=relative_time(now(), "-{auto_disablement_period}") | table keyid, object, last_time_seen | eval last_time_seen_human=strftime(last_time_seen, "%c") """ ) # kwargs kwargs_search = { "earliest_time": "-5m", "latest_time": "now", "preview": "false", "output_mode": "json", "count": 0, } logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting the main data collection for entities to be disabled according to auto-disablement setting. (auto_disablement_period="{auto_disablement_period}")' ) try: reader = run_splunk_search( self.service, search, kwargs_search, 24, 5, ) for item in reader: if isinstance(item, dict): entities_to_be_disabled.append(item.get("keyid")) except Exception as e: msg = f'auto-disablement record inspection search failed with exception="{str(e)}"' logging.error(msg) raise Exception(msg) if len(entities_to_be_disabled) > 0: logging.warning( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, entities to be disabled were found, list="{entities_to_be_disabled}"' ) # turn entities_to_be_disabled list into CSV entities_to_be_disabled_csv = ",".join(entities_to_be_disabled) # call mass disablement endpoint if component == "dsm": target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_dsm/write/ds_monitoring" elif component == "dhm": target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_dhm/write/dh_monitoring" elif component == "mhm": target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_mhm/write/mh_monitoring" try: response = session.post( target_url, data=json.dumps( { "tenant_id": self.tenant_id, "keys_list": entities_to_be_disabled_csv, "action": "disable", "update_comment": f"auto-disabled by the system, last seen data is beyond the system wide auto-disablement period of {splk_general_feeds_auto_disablement_period}", } ), verify=False, timeout=600, ) if response.status_code not in (200, 201, 204): msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, query has failed, response.status_code="{response.status_code}", response.text="{response.text}"' logging.error(msg) else: try: success_count = response.json().get("success_count") except Exception as e: success_count = 0 msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, request was successful, success_count="{success_count}"' logging.info(msg) except Exception as e: msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, ctask="{task_name}", task_instance_id={task_instance_id}, request failed with exception="{str(e)}"' logging.info(msg) else: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, no entities to be disabled were found' ) # end subtask logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.' ) # # subtask: handle_sync_entities # task_start = time.time() task_instance_id = self.get_uuid() task_name = "inspect_collection:handle_sync_entities" # start task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) # # Inspecting statuses # # # START raw collections records: Get raw collection records using a Splunk search # # search search = remove_leading_spaces( f"""\ | trackmegetcoll tenant_id="{self.tenant_id}" component="{component}" | fields - _raw | table * | lookup trackme_{component}_tenant_{self.tenant_id} _key as keyid OUTPUT object_state as kvcoll_object_state, anomaly_reason as kvcoll_anomaly_reason, latest_flip_time as kvcoll_latest_flip_time | where object_state!=kvcoll_object_state """ ) # kwargs kwargs_search = { "earliest_time": "-5m", "latest_time": "now", "preview": "false", "output_mode": "json", "count": 0, } delta_records = [] delta_records_keys = set() delta_records_objects = set() delta_records_dict = {} try: reader = run_splunk_search( self.service, search, kwargs_search, 24, 5, ) for item in reader: if isinstance(item, dict): delta_records.append(item) delta_records_keys.add(item.get("keyid")) delta_records_objects.add(item.get("object")) delta_records_dict[item.get("keyid")] = item except Exception as e: msg = f'main search failed with exception="{str(e)}"' logging.error(msg) raise Exception(msg) # # END raw collections records: Get raw collection records using a Splunk search # # # Handle delta records # inspectcollection_compare_records_start_time = time.time() for item in delta_records: item_key = item.get("keyid") item_object = decode_unicode(item.get("object")) item_alias = item.get("alias") item_object_state = item.get("object_state") item_object_category = item.get("object_category") item_anomaly_reason = item.get("anomaly_reason") item_monitored_state = item.get("monitored_state") item_priority = item.get("priority") # our delta state collection_object_state = item.get("kvcoll_object_state") # previous_anomaly_reason collection_anomaly_reason = item.get( "kvcoll_anomaly_reason", "unknown" ) # previous flip time try: collection_latest_flip_time = float( item.get("kvcoll_latest_flip_time", 0) ) except Exception as e: collection_latest_flip_time = 0 # disruption time disruption_time = 0 # compare the object state with item_object_state using decisionmaker_collection_records_dict using the key # if the object_state value is different, log the issue logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, collection record object state is not consistent, object="{item_object}", object_id="{item_key}", in_collection_object_state="{collection_object_state}", in_result_object_state="{item_object_state}", in_collection_anomaly_reason="{collection_anomaly_reason}"' ) # get the current kvrecord kvrecord_updated = False try: kvrecord = inspect_collection.data.query( query=json.dumps({"_key": item_key}) )[0] # update the kvrecord object_state, status_message and anomaly_reason kvrecord["object_state"] = item_object_state kvrecord["status_message"] = item.get("status_message") kvrecord["anomaly_reason"] = item_anomaly_reason kvrecord["mtime"] = time.time() kvrecord["latest_flip_time"] = time.time() kvrecord["latest_flip_state"] = item_object_state # process the KVstore record update inspect_collection.data.update(item_key, json.dumps(kvrecord)) kvrecord_updated = True logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, collection record object update successfully, object="{item_object}", object_id="{item_key}"' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, failed to update the KVstore record, object="{item_object}", collection_name="{collection_name}", exception="{str(e)}"' ) # proceeed with next steps if kvrecord_updated: try: # calculate disruption time if current_state is green and previous_state was red if ( item_object_state == "green" and collection_object_state == "red" ): try: disruption_time = round( (time.time() - collection_latest_flip_time), 2, ) except Exception as e: disruption_time = 0 flip_timestamp = time.strftime( "%d/%m/%Y %H:%M:%S", time.localtime(time.time()), ) disruption_time_str = f', disruption_time="{disruption_time}"' if disruption_time and disruption_time > 0 else "" flip_result = f'{flip_timestamp}, object="{item_object}" has flipped from previous_state="{collection_object_state}" to state="{item_object_state}" with anomaly_reason="{item_anomaly_reason}", previous_anomaly_reason="{collection_anomaly_reason}"{disruption_time_str}' flip_record = { "timeStr": flip_timestamp, "tenant_id": self.tenant_id, "alias": item_alias, "keyid": item_key, "object": item_object, "object_category": item_object_category, "object_state": item_object_state, "object_previous_state": collection_object_state, "priority": item_priority, "latest_flip_time": time.time(), "latest_flip_state": item_object_state, "anomaly_reason": item_anomaly_reason, "result": flip_result, } # add event_id flip_record["event_id"] = hashlib.sha256( json.dumps(flip_record).encode() ).hexdigest() trackme_gen_state( index=tenant_indexes["trackme_summary_idx"], sourcetype="trackme:flip", source="flip_state_change_tracking", event=flip_record, ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, TrackMe flipping event created successfully, record="{json.dumps(flip_record, indent=1)}"' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, object="{item_object}", task="{task_name}", task_instance_id={task_instance_id}, record="{json.dumps(flip_record, indent=1)}", failed to generate a flipping state event with exception="{e}"' ) # # SLA metrics # # create a list for SLA metrics generation sla_metrics_records = [] if item_object_state == "green": object_num_state = 1 elif item_object_state == "red": object_num_state = 2 elif item_object_state == "orange": object_num_state = 3 elif item_object_state == "blue": object_num_state = 4 else: object_num_state = 5 # add to our list sla_metrics_records.append( { "tenant_id": self.tenant_id, "object_id": item_key, "object": item_object, "alias": item_alias, "object_category": item_object_category, "monitored_state": item_monitored_state, "priority": item_priority, "metrics_event": {"object_state": object_num_state}, } ) # call the SLA gen metrics function sla_metrics_gen_start = time.time() try: sla_metrics = trackme_sla_gen_metrics( self.tenant_id, tenant_indexes.get("trackme_metric_idx"), sla_metrics_records, ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, function trackme_sla_gen_metrics success {sla_metrics}, run_time={round(time.time()-sla_metrics_gen_start, 3)}, no_entities={len(sla_metrics_records)}' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, function trackme_sla_gen_metrics failed with exception {str(e)}' ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, no_delta_records="{len(delta_records_keys)}", run_time="{round((time.time() - inspectcollection_compare_records_start_time), 3)}", collection="{inspect_collection_name}"' ) # # END comparison # # end subtask logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.' ) # # # # # Call the trackme_register_tenant_component_summary # # Use threading to do an async call to the register summary without waiting for it to complete thread = threading.Thread( target=self.register_component_summary_async, args=( session_key, self._metadata.searchinfo.splunkd_uri, self.tenant_id, component, ), ) thread.start() logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, trackme_register_tenant_component_summary was requested.' ) # # task: untracked_entities # # # splk-dsm - untracked entities # # context: this activity tracks and maintain state for untracked entities # untracked entities are entities which are entirely out of the scope of any trackers, and therefore not maintained otherwise task_instance_id = self.get_uuid() task_name = "untracked_entities" task_start = time.time() # start task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) if vtenant_record.get("tenant_dsm_enabled") == True: component = "dsm" logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting untracked entities now.' ) # kwargs kwargs_oneshot = { "earliest_time": "-5m", "latest_time": "now", "output_mode": "json", "count": 0, } untracked_entities_count = 0 untracked_entities_processed_objects = [] untracked_entities_search = f"""\ | inputlookup trackme_{component}_tenant_{self.tenant_id} | eval key=_key ``` target any entity that has not been updated since more than 15m ``` | eval time_sec_since_inspection=now()-tracker_runtime | where ( time_sec_since_inspection>900 OR isnull(tracker_runtime) ) ``` called the offline abstract macro version ``` `trackme_{component}_tracker_abstract({self.tenant_id})` ``` collects latest collection state into the summary index ``` | `trackme_collect_state("current_state_tracking:splk-{component}:{self.tenant_id}", "object", "{self.tenant_id}")` ``` output flipping change status if changes ``` | trackmesplkgetflipping tenant_id="{self.tenant_id}" object_category="splk-{component}" ``` update the KVstore collection ``` | `trackme_outputlookup_tracker_health(trackme_{component}_tenant_{self.tenant_id}, key)` ``` update the delay metric only ``` | `trackme_mcollect(object, splk-{component}, "metric_name:trackme.splk.feeds.lag_event_sec=data_last_lag_seen", "tenant_id, object_category, object", "{self.tenant_id}")` ``` summarize job ``` | stats count as report_entities_count, values(object) as objects by tenant_id """ # run the main report, every result is a Splunk search to be executed on its own thread try: reader = run_splunk_search( self.service, untracked_entities_search, kwargs_oneshot, 24, 5, ) for item in reader: if isinstance(item, dict): untracked_entities_count += 1 logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, entities_count="{len(item)}"' ) untracked_entities_processed_objects = item.get("objects", []) if untracked_entities_count == 0: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, there are no untracked entities currently.' ) except Exception as e: # Call the component register trackme_register_tenant_object_summary( session_key, self._metadata.searchinfo.splunkd_uri, self.tenant_id, "all", report_name, "failure", time.time(), str(time.time() - start), str(e), "-5m", "now", ) msg = f'task="{task_name}", task_instance_id={task_instance_id}, tenant_id="{self.tenant_id}", main search failed with exception="{str(e)}"' logging.error(msg) raise Exception(msg) if untracked_entities_processed_objects: handler_events_records = [] for object_name in untracked_entities_processed_objects: handler_events_records.append( { "object": object_name, "object_category": f"splk-{component}", "object_id": hashlib.sha256( object_name.encode("utf-8") ).hexdigest(), "handler": "health_tracker:untracked_entities", "handler_message": "Entity was inspected by the heath tracker, it is out of the scope of any hybrid tracker due to high delay and/or latency.", "handler_troubleshoot_search": f"index=_internal sourcetype=trackme:custom_commands:trackmetrackerhealth tenant_id={self.tenant_id} component=splk-{component} task=untracked_entities", "handler_time": time.time(), } ) # notification event try: trackme_handler_events( session_key=self._metadata.searchinfo.session_key, splunkd_uri=self._metadata.searchinfo.splunkd_uri, tenant_id=self.tenant_id, sourcetype="trackme:handler", source=f"trackme:handler:{self.tenant_id}", handler_events=handler_events_records, ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", component="splk-{component}", could not send notification event, exception="{e}"' ) # # splk-dhm - untracked entities # # context: this activity tracks and maintain state for untracked entities # untracked entities are entities which are entirely out of the scope of any trackers, and therefore not maintained otherwise if vtenant_record.get("tenant_dhm_enabled") == True: component = "dhm" logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting untracked entities now.' ) # kwargs kwargs_oneshot = { "earliest_time": "-5m", "latest_time": "now", "output_mode": "json", "count": 0, } untracked_entities_count = 0 untracked_entities_processed_objects = [] untracked_entities_search = f"""\ | inputlookup trackme_{component}_tenant_{self.tenant_id} | eval key=_key ``` target any entity that has not been updated since more than 15m ``` | eval time_sec_since_inspection=now()-tracker_runtime | where ( time_sec_since_inspection>900 OR isnull(tracker_runtime) ) ``` called the offline abstract macro version ``` `trackme_{component}_tracker_abstract({self.tenant_id})` ``` collects latest collection state into the summary index ``` | `trackme_collect_state("current_state_tracking:splk-{component}:{self.tenant_id}", "object", "{self.tenant_id}")` ``` output flipping change status if changes ``` | trackmesplkgetflipping tenant_id="{self.tenant_id}" object_category="splk-{component}" ``` update the KVstore collection ``` | `trackme_outputlookup_tracker_health(trackme_{component}_tenant_{self.tenant_id}, key)` ``` update the delay metric only ``` | `trackme_mcollect(object, splk-{component}, "metric_name:trackme.splk.feeds.lag_event_sec=data_last_lag_seen", "tenant_id, object_category, object", "{self.tenant_id}")` ``` summarize job ``` | stats count as report_entities_count, values(object) as objects by tenant_id """ # run the main report, every result is a Splunk search to be executed on its own thread try: reader = run_splunk_search( self.service, untracked_entities_search, kwargs_oneshot, 24, 5, ) for item in reader: if isinstance(item, dict): untracked_entities_count += 1 logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, entities_count="{len(item)}"' ) untracked_entities_processed_objects = item.get("objects", []) if untracked_entities_count == 0: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, there are no untracked entities currently.' ) except Exception as e: # Call the component register trackme_register_tenant_object_summary( session_key, self._metadata.searchinfo.splunkd_uri, self.tenant_id, "all", report_name, "failure", time.time(), str(time.time() - start), str(e), "-5m", "now", ) msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, main search failed with exception="{str(e)}"' logging.error(msg) raise Exception(msg) if untracked_entities_processed_objects: # if untracked_entities_processed_objects is a string (a single object was reported), convert it to a list if isinstance(untracked_entities_processed_objects, str): untracked_entities_processed_objects = [ untracked_entities_processed_objects ] handler_events_records = [] for object_name in untracked_entities_processed_objects: handler_events_records.append( { "object": object_name, "object_id": hashlib.sha256( object_name.encode("utf-8") ).hexdigest(), "object_category": f"splk-{component}", "handler": "health_tracker:untracked_entities", "handler_message": "Entity was inspected by the heath tracker, it is out of the scope of any hybrid tracker due to high delay and/or latency.", "handler_troubleshoot_search": f"index=_internal sourcetype=trackme:custom_commands:trackmetrackerhealth tenant_id={self.tenant_id} component=splk-{component} task=untracked_entities", "handler_time": time.time(), } ) # notification event try: trackme_handler_events( session_key=self._metadata.searchinfo.session_key, splunkd_uri=self._metadata.searchinfo.splunkd_uri, tenant_id=self.tenant_id, sourcetype="trackme:handler", source=f"trackme:handler:{self.tenant_id}", handler_events=handler_events_records, ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", component="splk-{component}", could not send notification event, exception="{e}"' ) # end task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.' ) # # task: duplicated_entities # task_instance_id = self.get_uuid() task_name = "duplicated_entities" task_start = time.time() # start task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) # all components except splk-wlk # # context: this situation is not expected, but if we have duplicated entities, we need to verify and purge them # splk-wlk - duplicated entities # # context: this activity tracks for duplicated entities in the Workload component # under some rare circumstances, the Splunk scheduler logs may lack the user context, althrough we implement several safeties # if this happens, we need to verify and purge any duplicated entity with the system user context instead of the proper user context for component in ("dsm", "dhm", "mhm", "wlk", "flx", "fqm"): if ( vtenant_record.get(f"tenant_{component}_enabled") == True and vtenant_record.get("tenant_replica") == False ): logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting entities now.' ) # kwargs kwargs_oneshot = { "earliest_time": "-5m", "latest_time": "now", "output_mode": "json", "count": 0, } duplicated_entities_count = 0 duplicated_entities_list = [] # specific search for wlk if component == "wlk": duplicated_entities_search = remove_leading_spaces( f"""\ | inputlookup trackme_wlk_tenant_{self.tenant_id} | eval keyid=_key | fields keyid, account, app, user, savedsearch_name, object, last_seen | eventstats count as dcount by account, app, savedsearch_name | where dcount>1 | sort - 0 savedsearch_name, last_seen """ ) else: # other components duplicated_entities_search = remove_leading_spaces( f"""\ | inputlookup trackme_{component}_tenant_{self.tenant_id} | eval keyid=_key | sort 0 object | eventstats count as dcount by object | streamstats count as rank by object | where dcount>1 ``` handle rank if the duplicated is due to FIPS migration ``` | eval rank=if(len(keyid) == 64, 2, 1) | where rank=1 """ ) # run the main report, every result is a Splunk search to be executed on its own thread logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, executing search="{duplicated_entities_search}"' ) try: reader = run_splunk_search( self.service, duplicated_entities_search, kwargs_oneshot, 24, 5, ) for item in reader: if isinstance(item, dict): duplicated_entities_count += 1 duplicated_entities_list.append(item.get("keyid")) logging.warning( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, detected duplicated entity, keyid="{item.get("keyid")}", object="{item.get("object")}"' ) if duplicated_entities_count == 0: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, there are no duplicated entities currently.' ) except Exception as e: # Call the component register trackme_register_tenant_object_summary( session_key, self._metadata.searchinfo.splunkd_uri, self.tenant_id, "all", report_name, "failure", time.time(), str(time.time() - start), str(e), "-5m", "now", ) msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, main search failed with exception="{str(e)}"' logging.error(msg) raise Exception(msg) # process if needed if duplicated_entities_count > 0: # target if component == "dsm": target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_dsm/write/ds_delete" elif component == "dhm": target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_dhm/write/dh_delete" if component == "mhm": target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_mhm/write/mh_delete" if component == "flx": target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_flx/write/flx_delete" if component == "fqm": target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_fqm/write/fqm_delete" if component == "wlk": target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_wlk/write/wlk_delete" # data # turn duplicated_entities_list into a comma separated string # update comment if component == "wlk": update_comment = "One or more duplicated entities were detected by the health tracker, this condition can happen when Splunk scheduler logs lack the user context, automated purge of these entities." else: update_comment = "One or more duplicated entities were detected by the health tracker, this condition is not expected and TrackMe needs to purge duplicates to avoid further issues." duplicated_entities_list = ",".join(duplicated_entities_list) post_data = { "tenant_id": self.tenant_id, "keys_list": duplicated_entities_list, "deletion_type": "temporary", "update_comment": update_comment, } try: response = session.post( target_url, data=json.dumps(post_data), verify=False, timeout=600, ) msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, duplicated entities purge successful, results="{json.dumps(response.json(), indent=2)}"' logging.info(msg) except Exception as e: msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, duplicated entities purge failed with exception="{str(e)}"' logging.info(msg) # end task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.' ) # # task: check_trackers_collections # # this task is designed to verify that trackers referenced in the dedicated collections are still present in the system # if not, it will remove the tracker from the collection task_instance_id = self.get_uuid() task_name = "check_trackers_collections" task_start = time.time() # start task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) def check_trackers_existence(vtenant_record, component): logging.info(f"Checking tracker definitions for component: {component}") # Load the tracker collection associated with the component (source of truth) tracker_collection_name = ( f"kv_trackme_{component}_hybrid_trackers_tenant_{self.tenant_id}" ) tracker_collection = self.service.kvstore[tracker_collection_name] # Get all the tracker records tracker_records = tracker_collection.data.query() for tracker_record in tracker_records: record_knowledge_objects = json.loads( tracker_record.get("knowledge_objects", "{}") ) # get the reports list reports_list = record_knowledge_objects.get("reports", []) # identify the main tracker (tracker_main_name) which contains _tracker_tenant_ in the name tracker_main_name = None for report_name in reports_list: if "_tracker_tenant_" in report_name: tracker_main_name = report_name break # Verify the existence of the main tracker, if it cannot be found in the system, the entire record will be removed from the collection purge_tracker_record = False # the main tracker was found in the record if tracker_main_name: # process savedsearch_definition = None try: savedsearch = self.service.saved_searches[tracker_main_name] savedsearch_definition = savedsearch.content["search"] savedsearch_content = savedsearch.content except Exception as e: savedsearch_definition = None savedsearch_content = {} # purge if necessary if not savedsearch_definition: purge_tracker_record = True logging.warning( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, the main tracker="{tracker_main_name}" does not exist anymore, the tracker record will be removed from the collection.' ) else: # the main tracker was not found in the record, the record is considered as invalid and will be removed from the collection purge_tracker_record = True logging.warning( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, the tracker record="{tracker_record}" is invalid, the tracker record will be removed from the collection.' ) # purge if necessary if purge_tracker_record: try: tracker_collection.data.delete( json.dumps({"_key": tracker_record.get("_key")}) ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, the tracker record was successfully removed from the collection.' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, the tracker record failed to be removed from the collection, exception="{str(e)}"' ) def recreate_missing_tracker_records(vtenant_record, component): """ Recreate hybrid tracker records in dedicated KVstore if they exist in tenant__hybrid_objects but are missing from the dedicated collection. """ logging.info(f"Checking for missing tracker records to recreate for component: {component}") # Load the tenant hybrid objects from vtenant_record (central source) hybrid_objects_json = vtenant_record.get(f"tenant_{component}_hybrid_objects") if not hybrid_objects_json: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, No hybrid objects found in vtenant_record for component "{component}", skipping recreation check.' ) return try: hybrid_objects = json.loads(hybrid_objects_json) except Exception as e: logging.warning( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Failed to parse hybrid_objects JSON, exception="{str(e)}"' ) return reports_list = hybrid_objects.get("reports", []) macros_list = hybrid_objects.get("macros", []) if not reports_list: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, No reports found in hybrid_objects for component "{component}", skipping recreation check.' ) return # Load the dedicated tracker collection tracker_collection_name = ( f"kv_trackme_{component}_hybrid_trackers_tenant_{self.tenant_id}" ) tracker_collection = self.service.kvstore[tracker_collection_name] # Get existing tracker records from dedicated collection existing_tracker_records = tracker_collection.data.query() existing_tracker_names = set() for record in existing_tracker_records: tracker_name = record.get("tracker_name") if tracker_name: existing_tracker_names.add(tracker_name) # Process wrapper reports to extract tracker names # Pattern: trackme__hybrid__wrapper_tenant_ wrapper_prefix = f"trackme_{component}_hybrid_" wrapper_suffix = f"_wrapper_tenant_{self.tenant_id}" # Track trackers we've already processed to avoid duplicates processed_trackers = {} for report_name in reports_list: # Only process wrapper reports to identify trackers if "_wrapper_" not in report_name: continue # Extract tracker_name from wrapper report name # Pattern: trackme__hybrid__wrapper_tenant_ if report_name.startswith(wrapper_prefix) and report_name.endswith(wrapper_suffix): # Remove prefix and suffix to get tracker_name tracker_name = report_name[len(wrapper_prefix):-len(wrapper_suffix)] # Check if this tracker exists in the dedicated collection if tracker_name not in existing_tracker_names and tracker_name not in processed_trackers: # Collect all reports and macros for this tracker tracker_reports = [] tracker_macros = [] # Find all reports that belong to this tracker # Use explicit expected report name construction for precise matching # This avoids issues with reserved words (abstract, wrapper, tracker) and substring matches # Reports patterns vary by component: # - Components with abstract (dsm, dhm, mhm): # * trackme__hybrid_abstract__tenant_ # * trackme__hybrid__wrapper_tenant_ # * trackme__hybrid__tracker_tenant_ # - Components without abstract (flx, wlk, fqm): # * trackme__hybrid__wrapper_tenant_ # * trackme__hybrid__tracker_tenant_ # Construct expected report names explicitly for exact matching expected_reports = [] # Components with abstract reports: dsm, dhm, mhm if component in ["dsm", "dhm", "mhm"]: expected_reports.append(f"trackme_{component}_hybrid_abstract_{tracker_name}_tenant_{self.tenant_id}") # All components have wrapper and tracker reports expected_reports.append(f"trackme_{component}_hybrid_{tracker_name}_wrapper_tenant_{self.tenant_id}") expected_reports.append(f"trackme_{component}_hybrid_{tracker_name}_tracker_tenant_{self.tenant_id}") # Match reports using exact names for report in reports_list: if report in expected_reports: tracker_reports.append(report) # Find all macros that belong to this tracker # Note: Macros are only applicable to dsm, dhm, mhm components # Macro pattern: trackme__hybrid_root_constraint__tenant_ # Use exact expected macro name for matching (similar to reports above) if component in ["dsm", "dhm", "mhm"]: expected_macro = f"trackme_{component}_hybrid_root_constraint_{tracker_name}_tenant_{self.tenant_id}" if expected_macro in macros_list: tracker_macros.append(expected_macro) # Only proceed if we have at least one report if tracker_reports: processed_trackers[tracker_name] = { "reports": tracker_reports, "macros": tracker_macros } # Recreate missing tracker records for tracker_name, knowledge_data in processed_trackers.items(): logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Recreating missing tracker record for "{tracker_name}" in dedicated collection.' ) # Build knowledge_objects structure (without properties as per requirement) knowledge_objects = { "reports": knowledge_data["reports"] } # Add macros if present (only for components that use them) if knowledge_data["macros"]: knowledge_objects["macros"] = knowledge_data["macros"] # Create the tracker record new_tracker_record = { "_key": hashlib.sha256(tracker_name.encode("utf-8")).hexdigest(), "tracker_name": tracker_name, "knowledge_objects": json.dumps(knowledge_objects, indent=2), "created_time": time.time(), "created_by": "health_tracker" } # Add component-specific fields if component == "wlk": # wlk tracker records require tracker_type field # tracker_name format is: {tracker_type}_{uuid} # Extract tracker_type from tracker_name # Note: Some tracker types contain underscores (e.g., inactive_entities, splunkcloud_svc) # so we need to check for multi-word types first before falling back to simple split valid_wlk_tracker_types = [ "main", "introspection", "scheduler", "metadata", "orphan", "inactive_entities", "splunkcloud_svc", "notable" ] extracted_tracker_type = None # First, try to match known multi-word tracker types for valid_type in valid_wlk_tracker_types: if tracker_name.startswith(valid_type + "_") or tracker_name == valid_type: extracted_tracker_type = valid_type break # If no match found, fall back to simple split for single-word types if not extracted_tracker_type and "_" in tracker_name: first_segment = tracker_name.split("_", 1)[0] if first_segment in valid_wlk_tracker_types: extracted_tracker_type = first_segment if extracted_tracker_type: new_tracker_record["tracker_type"] = extracted_tracker_type else: logging.warning( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Could not extract valid tracker_type from tracker_name="{tracker_name}" (expected format: tracker_type_uuid)' ) elif component in ["flx", "fqm"]: # flx and fqm use tracker_id field new_tracker_record["tracker_id"] = tracker_name try: # Final safety check: verify the tracker doesn't exist before insertion final_check = tracker_collection.data.query( query=json.dumps({"tracker_name": tracker_name}) ) if not final_check: tracker_collection.data.insert(json.dumps(new_tracker_record)) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Successfully recreated tracker record for "{tracker_name}" in dedicated collection.' ) else: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Tracker "{tracker_name}" already exists in dedicated collection, skipping recreation.' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Failed to recreate tracker record for "{tracker_name}", exception: {str(e)}' ) # Main logic components = ["dsm", "dhm", "mhm", "flx", "wlk", "fqm"] for component in components: if vtenant_record.get(f"tenant_{component}_enabled"): check_trackers_existence(vtenant_record, component) recreate_missing_tracker_records(vtenant_record, component) # end task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.' ) # # task: check_trackers # task_instance_id = self.get_uuid() task_name = "check_trackers_definition" task_start = time.time() # start task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) def check_trackers_definition(vtenant_record, component): logging.info(f"Checking tracker definitions for component: {component}") # Load the tracker collection associated with the component (source of truth) tracker_collection_name = ( f"kv_trackme_{component}_hybrid_trackers_tenant_{self.tenant_id}" ) tracker_collection = self.service.kvstore[tracker_collection_name] # Get all the tracker records tracker_records = tracker_collection.data.query() # Initialize empty sets for the reports and macros that should be in the vtenant_record truth_reports = set() truth_macros = set() for tracker_record in tracker_records: record_knowledge_objects = json.loads( tracker_record.get("knowledge_objects", "{}") ) # Collect the reports and macros from the tracker record's knowledge_objects truth_reports.update(record_knowledge_objects.get("reports", [])) truth_macros.update(record_knowledge_objects.get("macros", [])) # Load the current tenant hybrid objects from vtenant_record (destination) hybrid_objects_json = vtenant_record.get( f"tenant_{component}_hybrid_objects" ) if hybrid_objects_json: # Load the JSON object from the hybrid_objects field hybrid_objects = json.loads(hybrid_objects_json) else: # If no existing hybrid_objects, initialize an empty structure hybrid_objects = {"reports": [], "macros": []} vtenant_reports = set(hybrid_objects.get("reports", [])) vtenant_macros = set(hybrid_objects.get("macros", [])) # Compare and find missing reports/macros in the vtenant_record missing_reports = truth_reports - vtenant_reports missing_macros = truth_macros - vtenant_macros # If there are any missing reports or macros, add them to the vtenant_record if missing_reports or missing_macros: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Missing reports: {missing_reports} or macros: {missing_macros} in vtenant_record.' ) # Update the vtenant_record with missing reports and macros hybrid_objects["reports"] = list(vtenant_reports.union(truth_reports)) hybrid_objects["macros"] = list(vtenant_macros.union(truth_macros)) # Save the updated hybrid objects back to the vtenant_record vtenant_record[f"tenant_{component}_hybrid_objects"] = json.dumps( hybrid_objects, indent=2 ) try: self.service.kvstore["kv_trackme_virtual_tenants"].data.update( str(vtenant_record["_key"]), json.dumps(vtenant_record) ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, vtenant_record updated successfully.' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Failed to update vtenant_record, exception: {str(e)}' ) def check_trackers_existence_in_dedicated_kvstore(vtenant_record, component): logging.info(f"Checking tracker existence in dedicated KVstore for component: {component}") # Load the central KVstore collection to get all tracker records central_collection_name = f"kv_trackme_{component}_tenant_{self.tenant_id}" try: central_collection = self.service.kvstore[central_collection_name] central_records = central_collection.data.query() except Exception as e: logging.warning( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Central collection "{central_collection_name}" not found or accessible, exception: {str(e)}' ) return # Load the dedicated tracker collection tracker_collection_name = ( f"kv_trackme_{component}_hybrid_trackers_tenant_{self.tenant_id}" ) tracker_collection = self.service.kvstore[tracker_collection_name] # Get existing tracker records from dedicated collection existing_tracker_records = tracker_collection.data.query() existing_tracker_names = set() for record in existing_tracker_records: tracker_name = record.get("tracker_name") if tracker_name: existing_tracker_names.add(tracker_name) # Track tracker names being processed in this batch to prevent duplicates processing_tracker_names = set() # Process each central record to find tracker names for central_record in central_records: tracker_name = central_record.get("tracker_name") if not tracker_name: continue # Check if tracker_name is a JSON array (concurrent tracker format) # If it's a JSON array, skip it - these are normalized tracker names, not full report names # We only process full report names that match the hybrid pattern try: if isinstance(tracker_name, str): parsed_tracker_name = json.loads(tracker_name) if isinstance(parsed_tracker_name, list): # This is a JSON array of normalized tracker names, skip it # These are from concurrent trackers and don't need hybrid tracker records continue except (json.JSONDecodeError, TypeError): # Not a JSON array, continue processing as a string pass # Extract the base tracker name by removing _wrapper_tenant_ or _tracker_tenant_ suffix base_tracker_name = None if "_wrapper_tenant_" in tracker_name: base_tracker_name = tracker_name.split("_wrapper_tenant_")[0] elif "_tracker_tenant_" in tracker_name: base_tracker_name = tracker_name.split("_tracker_tenant_")[0] if not base_tracker_name: continue # Remove the trackme__hybrid_ prefix to get the actual tracker name # This applies to all components that follow this naming convention expected_prefix = f"trackme_{component}_hybrid_" if base_tracker_name.startswith(expected_prefix): actual_tracker_name = base_tracker_name.replace(expected_prefix, "", 1) else: actual_tracker_name = base_tracker_name # Check if this tracker exists in the dedicated collection (by name or ID) # Also check if we're already processing this tracker name in this batch if (actual_tracker_name not in existing_tracker_names and actual_tracker_name not in processing_tracker_names): # Add to processing set to prevent duplicates in this batch processing_tracker_names.add(actual_tracker_name) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Tracker "{actual_tracker_name}" not found in dedicated collection, creating record.' ) # Create a new tracker record in the dedicated collection # Build knowledge_objects with both wrapper and tracker reports reports_list = [] # Add both wrapper and tracker reports wrapper_name = tracker_name.replace("_tracker_tenant_", "_wrapper_tenant_") reports_list = [wrapper_name, tracker_name] # Build knowledge_objects structure knowledge_objects = { "reports": reports_list } # Macros are only applicable to dsm, dhm, mhm components if component in ["dsm", "dhm", "mhm"]: # Extract the tracker identifier from the base tracker name # Example: trackme_dsm_hybrid_tracker-iew8hkxv -> tracker-iew8hkxv if "_hybrid_" in base_tracker_name: tracker_identifier = base_tracker_name.split("_hybrid_")[1] macro_name = f"trackme_{component}_hybrid_root_constraint_{tracker_identifier}_tenant_{self.tenant_id}" knowledge_objects["macros"] = [macro_name] new_tracker_record = { "tracker_name": actual_tracker_name, "tracker_id": actual_tracker_name, # tracker_id should equal tracker_name "knowledge_objects": json.dumps(knowledge_objects, indent=2), "created_time": time.time(), "created_by": "health_tracker" } try: # Final safety check: verify the tracker doesn't exist before insertion final_check = tracker_collection.data.query(query=json.dumps({"tracker_name": actual_tracker_name})) if not final_check: tracker_collection.data.insert(json.dumps(new_tracker_record)) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Successfully created tracker record for "{actual_tracker_name}" in dedicated collection.' ) else: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Tracker "{actual_tracker_name}" already exists in dedicated collection, skipping creation.' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Failed to create tracker record for "{actual_tracker_name}", exception: {str(e)}' ) # Main logic components = ["dsm", "dhm", "mhm", "flx", "wlk", "fqm"] for component in components: if vtenant_record.get(f"tenant_{component}_enabled"): check_trackers_definition(vtenant_record, component) check_trackers_existence_in_dedicated_kvstore(vtenant_record, component) # end task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.' ) # # task: check_alerts_definition # task_instance_id = self.get_uuid() task_name = "check_alerts_definition" task_start = time.time() # # Verify for each tenant record the content of tenant_alert_objects # - load the tenant_alert_objects object # - For each alert, verify that the alert exists in the system # - if not, remove the alert from the tenant_alert_objects object and update the record # def check_alerts_definition(alert_name): # get the current search definition try: alert_current = self.service.saved_searches[alert_name] alert_current_search = alert_current.content.get("search") return True except Exception as e: return False # start task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) # Load the tenant_alert_objects object tenant_alert_objects = vtenant_record.get("tenant_alert_objects", {}) if tenant_alert_objects: try: tenant_alert_objects = json.loads(tenant_alert_objects) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, Failed to load tenant_alert_objects, exception: {str(e)}' ) tenant_alert_objects = {} # alerts is a list stored in "alerts" key alerts = tenant_alert_objects.get("alerts", []) # verify each alert alerts_were_removed = False for alert_name in alerts: alert_exists = check_alerts_definition(alert_name) if not alert_exists: logging.warning( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, alert="{alert_name}" not found in saved searches, will be removed from tenant_alert_objects' ) alerts.remove(alert_name) if not alerts_were_removed: alerts_were_removed = True # save the updated tenant_alert_objects if alerts_were_removed: tenant_alert_objects["alerts"] = alerts # save the updated tenant_alert_objects vtenant_record["tenant_alert_objects"] = json.dumps( tenant_alert_objects, indent=2 ) try: self.service.kvstore["kv_trackme_virtual_tenants"].data.update( str(vtenant_record["_key"]), json.dumps(vtenant_record) ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, vtenant_record updated successfully.' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, Failed to update vtenant_record, exception: {str(e)}' ) # end task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.' ) # # task: logical_groups # task_instance_id = self.get_uuid() task_name = "check_logical_groups" task_start = time.time() # # Verify Logical Groups: # - load the logical groups KVstore collection # - verify that for each member of the groups, the member can be found in in any of the dsm/dhm/mhm/flx/fqm KVstore collection as an actively monitoreed entity # - if not, purge the member from the group # def query_kvstore_for_object(member, collection_suffix): target_collection_name = ( f"kv_trackme_{collection_suffix}_tenant_{self.tenant_id}" ) target_collection = self.service.kvstore[target_collection_name] query_string = { "$and": [ { "object": member, "monitored_state": "enabled", } ] } try: kvrecord = target_collection.data.query(query=json.dumps(query_string))[ 0 ] kvrecord_key = kvrecord.get("_key", None) except: kvrecord_key = None if kvrecord_key: logging.debug( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, member="{member}", found in KVstore collection="{target_collection_name}"' ) return True return False if ( vtenant_record.get("tenant_dsm_enabled") == True or vtenant_record.get("tenant_dhm_enabled") == True or vtenant_record.get("tenant_mhm_enabled") == True or vtenant_record.get("tenant_flx_enabled") == True or vtenant_record.get("tenant_fqm_enabled") == True ): # log start logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting to verify logical groups, any orphan logical group member will be purged automatically.' ) # time counter logical_group_check_start = time.time() # # Logical groups collection records # logical_group_coll = self.service.kvstore[ f"kv_trackme_common_logical_group_tenant_{self.tenant_id}" ] ( logical_groups_coll_records, logical_groups_by_group_key_dict, logical_groups_by_group_name_list, logical_groups_by_member_dict, logical_groups_by_member_list, ) = get_logical_groups_collection_records(logical_group_coll) # log all returned from the function logging.debug( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, logical_groups_coll_records={json.dumps(logical_groups_coll_records, indent=2)}, logical_groups_by_group_key_dict={json.dumps(logical_groups_by_group_key_dict, indent=2)}, logical_groups_by_group_name_list={json.dumps(logical_groups_by_group_name_list, indent=2)}, logical_groups_by_member_dict={json.dumps(logical_groups_by_member_dict, indent=2)}, logical_groups_by_member_list={json.dumps(logical_groups_by_member_list, indent=2)}' ) # loops through logical_groups_by_member_list if not empty, then check in each KVstore collection if we have a match logical_member_found = False logical_members_orphans = [] # ensure logical_groups_by_member_list is a list if isinstance(logical_groups_by_member_list, str): logical_groups_by_member_list = [logical_groups_by_member_list] if len(logical_groups_by_member_list) > 0: # # Orphans # for member in logical_groups_by_member_list: for tenant_setting, collection_suffix in [ ("tenant_dsm_enabled", "dsm"), ("tenant_dhm_enabled", "dhm"), ("tenant_mhm_enabled", "mhm"), ("tenant_flx_enabled", "flx"), ("tenant_fqm_enabled", "fqm"), ]: if vtenant_record.get(tenant_setting) == True: logical_member_found = query_kvstore_for_object( member, collection_suffix ) if logical_member_found: break if not logical_member_found: logical_members_orphans.append(member) # log orphans logging.debug( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, logical_members_orphans={json.dumps(logical_members_orphans, indent=2)}' ) # purge orphans if len(logical_members_orphans) > 0: # turn the list into a comma separated string logical_members_orphans = ",".join(logical_members_orphans) try: logical_group_purge_remove_response = ( logical_group_remove_object_from_groups( self._metadata.searchinfo.splunkd_uri, self._metadata.searchinfo.session_key, self.tenant_id, logical_members_orphans, ) ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, orphan_members="{logical_members_orphans}", successfully purged the logical groups collection, response="{json.dumps(logical_group_purge_remove_response, indent=2)}"' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, orphan_members="{logical_members_orphans}", failed to purge from the logical groups collection, exception={str(e)}' ) # # empty groups # for logical_group_record in logical_groups_coll_records: # get the group name object_group_name = logical_group_record.get("object_group_name") # get the members members = logical_group_record.get("object_group_members", None) if members: if not len(members) > 0: members = None if not members: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, group="{object_group_name}", group has no members, will be purged.' ) try: logical_group_delete_response = ( logical_group_delete_group_by_name( self._metadata.searchinfo.splunkd_uri, self._metadata.searchinfo.session_key, self.tenant_id, object_group_name, ) ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, group="{object_group_name}", group has been purged successfully, response="{json.dumps(logical_group_delete_response, indent=2)}"' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, group="{object_group_name}", failed to purge the group, exception={str(e)}' ) # log time logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, logical_groups_check_duration="{round(time.time() - logical_group_check_start, 3)}"' ) # end task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.' ) # # task: check_trackers # task_instance_id = self.get_uuid() task_name = "check_trackers_statuses" task_start = time.time() # start task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) # Set the query health_search = remove_leading_spaces( f""" | trackme mode=post url=/services/trackme/v2/configuration/get_tenant_ops_status body=\"{{'mode': 'raw', 'tenant_id': '{self.tenant_id}'}}\" | trackmeopsstatusexpand """ ) # logging logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id="{self.tenant_id}", Starting health_search, report="{report_name}", search="{health_search}"' ) # kwargs kwargs_oneshot = { "earliest_time": "-5m", "latest_time": "now", "output_mode": "json", "count": 0, } # run the main report, every result is a Splunk search to be executed on its own thread try: reader = run_splunk_search( self.service, health_search, kwargs_oneshot, 24, 5, ) # Call the component register trackme_register_tenant_object_summary( session_key, self._metadata.searchinfo.splunkd_uri, self.tenant_id, "all", report_name, "success", time.time(), str(time.time() - start), "The report was executed successfully", "-5m", "now", ) for item in reader: if isinstance(item, dict): # verify the knowledge object - if for some reason it is not existing anymore, we should remove it # and not take it into account any longer # process savedsearch_definition = None report_name = item.get("report") try: savedsearch = self.service.saved_searches[report_name] savedsearch_definition = savedsearch.content["search"] savedsearch_content = savedsearch.content except Exception as e: savedsearch_definition = None savedsearch_content = {} logging.warning( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, the report="{report_name}" does not exist anymore, somehow it was removed without TrackMe being aware of it, will get rid of this now.' ) if not savedsearch_definition: # extract component component = report_name.split("_")[1] # purge try: delete_register_summary = ( trackme_delete_tenant_object_summary( self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri, self.tenant_id, f"splk-{component}", report_name, ) ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, knowledge for the report="{report_name}" was purged successfully, response="{delete_register_summary}"' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, exception encountered while calling function trackme_delete_tenant_object_summary, exception="{str(e)}"' ) else: search_component = item.get("component") search_cron_schedule = savedsearch_content.get("cron_schedule") search_description = savedsearch_content.get("description") search_earliest = savedsearch_content.get( "dispatch.earliest_time" ) search_last_duration = item.get("last_duration") search_last_exec = item.get("last_exec") search_last_result = item.get("last_result") search_last_status = item.get("last_status") search_latest = savedsearch_content.get("dispatch.latest_time") search_report_name = report_name search_schedule_window = savedsearch_content.get( "schedule_window" ) search_tenant_id = item.get("tenant_id") search_workload_pool = savedsearch_content.get( "workload_pool", None ) # ACLs acl_report_info = None if self.get_acl: # try to get acl acl_link = savedsearch.links["alternate"] acl_report_info = {} acl_url = f"{self._metadata.searchinfo.splunkd_uri}{acl_link}/acl/list?output_mode=json" try: response = session.get( acl_url, verify=False, timeout=600, ) response_json = response.json() response.raise_for_status() acl_properties = response_json["entry"][0].get( "acl", {} ) acl_report_info = { "eai:acl.owner": acl_properties.get("owner"), "eai:acl.perms.read": acl_properties["perms"][ "read" ], "eai:acl.perms.write": acl_properties["perms"][ "write" ], "eai:acl.sharing": acl_properties.get("sharing"), } except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, exception encountered while trying to get the ACL for the report="{report_name}", exception="{str(e)}"' ) # set info record search_info_record = { "component": search_component, "cron_schedule": search_cron_schedule, "description": search_description, "earliest": search_earliest, "last_duration": search_last_duration, "last_exec": search_last_exec, "last_result": search_last_result, "last_status": search_last_status, "latest": search_latest, "report": search_report_name, "schedule_window": search_schedule_window, "tenant_id": search_tenant_id, } # most often the workload pool is not set, only add if explicitly set if search_workload_pool: search_info_record["workload_pool"] = search_workload_pool # add acl info if acl_report_info: search_info_record.update(acl_report_info) yield { "_time": time.time(), "_raw": search_info_record, "component": search_component, "cron_schedule": search_cron_schedule, "description": search_description, "earliest": search_earliest, "last_duration": search_last_duration, "last_exec": search_last_exec, "last_result": search_last_result, "last_status": search_last_status, "latest": search_latest, "report": search_report_name, "schedule_window": search_schedule_window, "tenant_id": search_tenant_id, "workload_pool": search_workload_pool, } # index the audit record try: trackme_state_event( session_key=self._metadata.searchinfo.session_key, splunkd_uri=self._metadata.searchinfo.splunkd_uri, tenant_id=self.tenant_id, index=tenant_indexes["trackme_audit_idx"], sourcetype="trackme:health", source=f"trackme:health:{self.tenant_id}", record=search_info_record, ) except Exception as e: error_msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, exception encountered while calling function trackme_state_event, exception="{str(e)}"' logging.error(error_msg) raise Exception(error_msg) except Exception as e: # Call the component register trackme_register_tenant_object_summary( session_key, self._metadata.searchinfo.splunkd_uri, self.tenant_id, "all", report_name, "failure", time.time(), str(time.time() - start), str(e), "-5m", "now", ) msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, main search failed with exception="{str(e)}"' logging.error(msg) raise Exception(msg) # end task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.' ) # # task: check_tenant_record_knowledge_objects # task_instance_id = self.get_uuid() task_name = "check_tenant_record_knowledge_objects" task_start = time.time() # start task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) # logic: # For each component, check the field tenant__hybrid_objects from the vtenant record # load the object as json, get the list reports and the list macros # for each object, check that it actually exists in Splunk # if not, delete the object from the vtenant record for component in ["dsm", "dhm", "mhm", "flx", "wlk", "fqm"]: # if the component is disabled, skip try: component_enablement = int(vtenant_record.get(f"tenant_{component}_enabled", 0)) except Exception as e: component_enablement = 0 if component_enablement == 0: continue # get the hybrid_objects field hybrid_objects = vtenant_record.get( f"tenant_{component}_hybrid_objects" ) try: hybrid_objects = json.loads(hybrid_objects) except Exception as e: hybrid_objects = {} # if the field does not exist, skip if not hybrid_objects: continue # if "reports" is in the list, get the list of reports if "reports" in hybrid_objects: reports = hybrid_objects.get("reports") else: reports = [] # if "macros" is in the list, get the list of macros if "macros" in hybrid_objects: macros = hybrid_objects.get("macros") else: macros = [] # check reports if reports: for report_name in reports: # process savedsearch_definition = None try: savedsearch = self.service.saved_searches[report_name] savedsearch_definition = savedsearch.content["search"] savedsearch_content = savedsearch.content except Exception as e: savedsearch_definition = None savedsearch_content = {} # purge if necessary if not savedsearch_definition: logging.warning( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, the report="{report_name}" does not exist anymore, somehow it was removed without TrackMe being aware of it, will get rid of this now.' ) # remove from list in hybrid_objects, udate the vetant record and update the KVstore collection reports.remove(report_name) hybrid_objects["reports"] = reports vtenant_record[f"tenant_{component}_hybrid_objects"] = ( json.dumps(hybrid_objects, indent=2) ) try: self.service.kvstore[ "kv_trackme_virtual_tenants" ].data.update( str(vtenant_record["_key"]), json.dumps(vtenant_record) ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, vtenant_record updated successfully.' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, Failed to update vtenant_record, exception: {str(e)}' ) # check macros if macros: for macro_name in macros: # process macro_definition = None try: macro = self.service.confs["macros"][macro_name] macro_definition = macro.content["definition"] except Exception as e: macro = None macro_definition = None # purge if necessary if not macro_definition: logging.warning( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, the macro="{macro_name}" does not exist anymore, somehow it was removed without TrackMe being aware of it, will get rid of this now.' ) # remove from list in hybrid_objects, udate the vetant record and update the KVstore collection macros.remove(macro_name) hybrid_objects["macros"] = macros vtenant_record[f"tenant_{component}_hybrid_objects"] = ( json.dumps(hybrid_objects, indent=2) ) try: self.service.kvstore[ "kv_trackme_virtual_tenants" ].data.update( str(vtenant_record["_key"]), json.dumps(vtenant_record) ) logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, vtenant_record updated successfully.' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, Failed to update vtenant_record, exception: {str(e)}' ) # end task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.' ) # # task: gen_sla_breaches_events # task_instance_id = self.get_uuid() task_name = "gen_sla_breaches_events" task_start = time.time() # start task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) # get sla gen events frequency try: sla_breaches_events_frequency = int( reqinfo["trackme_conf"]["sla"]["sla_breaches_events_frequency"] ) except Exception as e: sla_breaches_events_frequency = 86400 def process_sla_breaches_component(component, sla_breaches_events_frequency): logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, processing SLA breaches.' ) # Get the KVstore collection for SLA notifications collection_name = ( f"kv_trackme_{component}_sla_notifications_tenant_{self.tenant_id}" ) collection = self.service.kvstore[collection_name] # Run the search to get objects with SLA breaches search_string = f'| trackmegetcoll tenant_id="{self.tenant_id}" component="{component}" | where monitored_state="enabled" | table alias object object_category object_state priority keyid sla_* anomaly_reason status_message | where sla_is_breached=1' # kwargs kwargs_search = { "earliest_time": "-5m", "latest_time": "now", "preview": "false", "output_mode": "json", "count": 0, } try: search_results = run_splunk_search( self.service, search_string, kwargs_search, 24, 5, ) for item in search_results: if isinstance(item, dict): try: # Extract required fields alias = item.get("alias") object_value = item.get("object") object_category = item.get("object_category") object_state = item.get("object_state") priority = item.get("priority") keyid = item.get("keyid") anomaly_reason = item.get("anomaly_reason") status_message = item.get("status_message") sla_class = item.get("sla_class") sla_is_breached = item.get("sla_is_breached") sla_message = item.get("sla_message") sla_threshold = item.get("sla_threshold") sla_threshold_duration = item.get("sla_threshold_duration") sla_timer = item.get("sla_timer") sla_timer_duration = item.get("sla_timer_duration") # Check if we have a notification record for this object query_string = {"_key": keyid} try: kvrecord = collection.data.query( query=json.dumps(query_string) )[0] last_notification_time = float(kvrecord.get("mtime", 0)) current_time = time.time() # Only generate event if last notification was > 24 hours ago if ( current_time - last_notification_time > sla_breaches_events_frequency ): should_generate_event = True else: should_generate_event = False except Exception: # No record exists, we should generate an event should_generate_event = True last_notification_time = 0 if should_generate_event: # Create the SLA breach event record breach_record = { "timeStr": time.strftime( "%d/%m/%Y %H:%M:%S", time.localtime(time.time()) ), "tenant_id": self.tenant_id, "alias": alias, "object": decode_unicode(object_value), "keyid": keyid, "object_category": object_category, "object_state": object_state, "priority": priority, "anomaly_reason": anomaly_reason, "status_message": status_message, "sla_class": sla_class, "sla_is_breached": sla_is_breached, "sla_message": sla_message, "sla_threshold": sla_threshold, "sla_threshold_duration": sla_threshold_duration, "sla_timer": sla_timer, "sla_timer_duration": sla_timer_duration, } # Add event_id breach_record["event_id"] = hashlib.sha256( json.dumps(breach_record).encode() ).hexdigest() # Generate the event try: trackme_gen_state( index=tenant_indexes["trackme_summary_idx"], sourcetype="trackme:sla_breaches", source=f"health_tracker:{task_name}", event=breach_record, ) logging.info( f'TrackMe SLA breach event created successfully, tenant_id="{self.tenant_id}", sla_gen_events_frequency="{sla_breaches_events_frequency}", record="{json.dumps(breach_record, indent=1)}"' ) # Update or create the notification record notification_record = { "_key": keyid, "mtime": time.time(), "last_notification": breach_record, } try: collection.data.update( keyid, json.dumps(notification_record) ) except Exception: collection.data.insert( json.dumps(notification_record) ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", object="{object_value}", failed to generate a SLA breach event with exception="{e}"' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", failed to process record with exception="{e}"' ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", component="splk-{component}", failed to run SLA breaches search with exception="{e}"' ) # Main logic components = ["dsm", "dhm", "mhm", "flx", "wlk", "fqm"] if ( sla_breaches_events_frequency > 0 ): # only run if the frequency is greater than 0 for component in components: if vtenant_record.get(f"tenant_{component}_enabled"): process_sla_breaches_component( component, sla_breaches_events_frequency ) # end task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.' ) # # task: unclosed_stateful_incidents # task_instance_id = self.get_uuid() task_name = "unclosed_stateful_incidents" task_start = time.time() # start task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) # objective: get any opened or updated incidents in the KVstore, verify that: # - the entity associated with the incident is still existing and with monitored_state="enabled", if not the incident will be updated and closed. # - if the entity exists, but is in a non alerting state (green, blue), and the incident is older than 24 hours, the incident will be updated and closed. # get the KVstore collection for stateful incidents stateful_incidents_collection_name = ( f"kv_trackme_stateful_alerting_tenant_{self.tenant_id}" ) stateful_incidents_collection = self.service.kvstore[ stateful_incidents_collection_name ] def get_stateful_incidents(collection_name, collection): collection_records = [] collection_records_keys = set() collection_dict = {} try: end = False skip_tracker = 0 while end == False: process_collection_records = collection.data.query( skip=skip_tracker ) if len(process_collection_records) != 0: for item in process_collection_records: if item.get("_key") not in collection_records_keys: if item.get("alert_status") in ["opened", "updated"]: collection_records.append(item) collection_records_keys.add(item.get("object")) collection_dict[item.get("object")] = item skip_tracker += 500 else: end = True return collection_records, collection_records_keys, collection_dict except Exception as e: raise Exception(str(e)) # get the stateful incidents try: ( stateful_incidents_records, stateful_incidents_keys, stateful_incidents_dict, ) = get_stateful_incidents( stateful_incidents_collection_name, stateful_incidents_collection ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to call get_kv_collection, args={stateful_incidents_collection_name}, cannot process this task, exception="{str(e)}"' ) stateful_incidents_records = [] stateful_incidents_keys = set() stateful_incidents_dict = {} # iterate through opened or updated incidents for stateful_incident in stateful_incidents_records: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, inspecting stateful with _key="{stateful_incident.get("_key")}", incident="{stateful_incident}"' ) # get the object stateful_object = stateful_incident.get("object") # get the object_id stateful_object_id = stateful_incident.get("object_id") # get the object_category (ex: splk-dsm) stateful_object_category = stateful_incident.get("object_category") # get the object_state stateful_object_state = stateful_incident.get("object_state") # get the object status stateful_object_status = stateful_incident.get("object_status") # get the mtime stateful_incident_mtime = float(stateful_incident.get("mtime")) # calculate the incident duration stateful_incident_duration = time.time() - stateful_incident_mtime # access the data KVstore collection object_category_suffix = stateful_object_category.split("-")[1] data_collection_name = ( f"kv_trackme_{object_category_suffix}_tenant_{self.tenant_id}" ) data_collection = self.service.kvstore[data_collection_name] # get the object from the data collection try: data_object = data_collection.data.query( query=json.dumps({"_key": stateful_object_id}) )[0] except Exception as e: data_object = None # use-case 1: the object does not exist anymore stateful_object_exists = True if not data_object: stateful_object_exists = False # use-case 2: the object exists, but is in a non alerting state while the incident has not been closed 24 hours later stateful_incident_outdated = False if stateful_object_exists: if data_object.get("object_state", "green") in ["green", "blue"]: if stateful_incident_duration > 86400: stateful_incident_outdated = True elif data_object.get("monitored_state") != "enabled": stateful_incident_outdated = True # Update the incident if necessary if not stateful_object_exists or stateful_incident_outdated: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, update of outdated stateful incident is required, stateful_object_exists="{stateful_object_exists}", stateful_incident_outdated="{stateful_incident_outdated}", incident="{stateful_incident}"' ) # update the incident stateful_incident["alert_status"] = "closed" stateful_incident["mtime"] = time.time() if stateful_object_exists: stateful_incident["object_state"] = stateful_object_status # update the incident in the KVstore try: stateful_incidents_collection.data.update( stateful_incident.get("_key"), json.dumps(stateful_incident) ) except Exception as e: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to update stateful incident with exception="{e}"' ) else: logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, no action required against incident with _key="{stateful_incident.get("_key")}", stateful_object_exists="{stateful_object_exists}", stateful_incident_outdated="{stateful_incident_outdated}"' ) # end task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.' ) # # task: apply_licensing_restrictions # task_instance_id = self.get_uuid() task_name = "apply_licensing_restrictions" task_start = time.time() # start task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.' ) # # licensing restriction # # if the component is a restricted component and the product is not registered, it should be disabled now if license_is_valid == 0 and ( vtenant_record.get("tenant_flx_enabled") == 1 or vtenant_record.get("tenant_fqm_enabled") == 1 or vtenant_record.get("tenant_wlk_enabled") == 1 ): logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, due to licensing restrictions, this tenant will be automatically disabled, the tenant is running a restricted component while this instance is not registered' ) # target target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/vtenants/admin/disable_tenant" # data post_data = { "tenant_id": self.tenant_id, "update_comment": "Auto disabling this tenant due to licensing limitation, the tenant is running a restricted component while the product is not currently registered", "force": "true", } try: response = session.post( target_url, data=json.dumps(post_data), verify=False, timeout=600, ) return json.loads(response.text) except Exception as e: raise Exception( f'An exception was encountered while attempting to disable the tenant due to licensing restrictions, exception="{str(e)}"' ) elif ( license_is_valid == 0 and license_active_tenants > 2 and self.tenant_id not in license_active_tenants_list[0:2] ): logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, due to licensing restrictions, this tenant will be automatically disabled, the tenant is running a restricted component while this instance is not registered' ) # target target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/vtenants/admin/disable_tenant" # data post_data = { "tenant_id": self.tenant_id, "update_comment": f"Auto disabling this tenant due to licensing limitation, this deployment has reached the maximum number of tenants allowed ({license_active_tenants}), only the following tenants can be used: {license_active_tenants_list[0:2]}", "force": "true", } try: response = session.post( target_url, data=json.dumps(post_data), verify=False, timeout=600, ) return json.loads(response.text) except Exception as e: raise Exception( f'An exception was encountered while attempting to disable the tenant due to licensing restrictions, exception="{str(e)}"' ) elif ( license_is_valid == 1 and license_subscription_class == "enterprise" and license_active_tenants > 6 and self.tenant_id not in license_active_tenants_list[0:6] ): logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, due to licensing restrictions, this tenant will be automatically disabled, the tenant is over the maximum number of allowed tenants in Enterprise Edition' ) # target target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/vtenants/admin/disable_tenant" # data post_data = { "tenant_id": self.tenant_id, "update_comment": f"Auto disabling this tenant due to licensing limitation, this deployment has reached the maximum number of tenants allowed ({license_active_tenants}), only the following tenants can be used: {license_active_tenants_list[0:6]}", "force": "true", } try: response = session.post( target_url, data=json.dumps(post_data), verify=False, timeout=600, ) return json.loads(response.text) except Exception as e: raise Exception( f'An exception was encountered while attempting to disable the tenant due to licensing restrictions, exception="{str(e)}"' ) # An exception was raised while attempting to validate the license # Log the error but do nothing elif license_is_valid == 2: logging.error( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, an exception was raised while attempting to validate the license, no actions will be taken for now.' ) # end task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.' ) # end general task logging.info( f'tenant_id="{self.tenant_id}", instance_id={instance_id}, trackmetrackerhealth has terminated, total_run_time={round(time.time() - start, 3)}' ) dispatch(HealthTracker, sys.argv, sys.stdin, sys.stdout, __name__)