#!/usr/bin/env python # coding=utf-8 __author__ = "TrackMe Limited" __copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K." __credits__ = "TrackMe Limited, U.K." __license__ = "TrackMe Limited, all rights reserved" __version__ = "0.1.0" __maintainer__ = "TrackMe Limited, U.K." __email__ = "support@trackme-solutions.com" __status__ = "PRODUCTION" # Standard library imports import json import logging import os import sys import time # Third-party library imports import requests import urllib3 from logging.handlers import RotatingFileHandler # Disable insecure request warnings for urllib3 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) # set splunkhome splunkhome = os.environ["SPLUNK_HOME"] # set logging filehandler = RotatingFileHandler( "%s/var/log/splunk/trackme_stateful.log" % splunkhome, mode="a", maxBytes=10000000, backupCount=1, ) formatter = logging.Formatter( "%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s" ) logging.Formatter.converter = time.gmtime filehandler.setFormatter(formatter) log = logging.getLogger() # root logger - Good to get it only once. for hdlr in log.handlers[:]: # remove the existing file handlers if isinstance(hdlr, logging.FileHandler): log.removeHandler(hdlr) log.addHandler(filehandler) # set the new handler # set the log level to INFO, DEBUG as the default is ERROR log.setLevel(logging.INFO) # append current directory sys.path.append(os.path.dirname(os.path.abspath(__file__))) # import libs import import_declare_test # Import Splunk libs from splunklib.searchcommands import ( dispatch, GeneratingCommand, Configuration, Option, validators, ) # Import trackme libs from trackme_libs import trackme_reqinfo, run_splunk_search from trackme_libs_get_data import search_kv_collection_sdkmode import splunklib.client as client # Import trackme libs utils from trackme_libs_utils import decode_unicode, normalize_anomaly_reason, remove_leading_spaces # Import helper functions from stateful alert helper (these are utility functions, not filtering logic) sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "trackme")) from modalert_trackme_stateful_alert_helper import ( get_keyid_from_main_kvstore, get_object_state_from_main_kvstore, get_stateful_records_for_object_id, validate_object_state_via_rest, ) def generate_fields(records): """ Fhis function ensures that records have the same list of fields to allow Splunk to automatically extract these fields If a given result does not have a given field, it will be added to the record as an empty value """ all_keys = set() for record in records: all_keys.update(record.keys()) for record in records: for key in all_keys: if key not in record: record[key] = "" yield record def get_event_filtering_context( helper, service, server_uri, header, event, maintenance_info, alerting_states, ack_collection_keys=None, ack_collection_dict=None, bank_holidays_info=None, object_id_cache=None, object_state_cache=None, stateful_record_cache=None, ): """ Function to gather all context needed for filtering a stateful alert event. This includes object_id, object_state, stateful_record, ack status, etc. Args: helper: The helper object (can be None for custom command usage) service: The Splunk service object server_uri: The server URI header: The authorization header event: The event dictionary maintenance_info: The maintenance info dictionary alerting_states: List of alerting states (e.g., ["red", "orange"]) ack_collection_keys: Set of object names that have acks (for efficient lookup) ack_collection_dict: Dictionary mapping object -> ack_record (for efficient lookup) Returns: dict: Context dictionary with all filtering-related information """ # Helper function for logging def log_info(msg): if helper: helper.log_info(msg) else: log.info(msg) def log_debug(msg): if helper: helper.log_debug(msg) else: log.debug(msg) def log_error(msg): if helper: helper.log_error(msg) else: log.error(msg) def log_warning(msg): if helper: helper.log_warning(msg) else: log.warning(msg) # get tenant_id tenant_id = event["tenant_id"] # get alias, object, object_category, priority alias = event.get("alias", None) # get object, we may have to deal with problematic non ASCII chars object = decode_unicode(event["object"]) object_category = event["object_category"] priority = event.get("priority", "medium") # get component from object_category (splk-) component = object_category.split("-")[1] # check if the ack is active for the object using KVstore lookup ack_active = False ack_age = 0 # Use KVstore lookup if ack collection data is provided, otherwise fall back to REST call if ack_collection_keys is not None and ack_collection_dict is not None: # Efficient KVstore lookup using composite key (object::object_category) composite_key = f"{object}::{object_category}" if composite_key in ack_collection_keys: ack_record = ack_collection_dict.get(composite_key) # Record should match since we use composite key if ack_record: ack_active_string = ack_record.get("ack_state", "inactive") ack_mtime_raw = ack_record.get("ack_mtime", None) if ack_mtime_raw: try: ack_mtime = float(ack_mtime_raw) except (ValueError, TypeError): ack_mtime = time.time() else: ack_mtime = time.time() if ack_active_string == "active": ack_age = time.time() - ack_mtime ack_active = True else: ack_active = False ack_age = 0 else: ack_active = False ack_age = 0 else: ack_active = False ack_age = 0 # Log the result (object_id will be available after we get it from KVstore) if ack_active: log_info( f"activity=ack_check, tenant_id={tenant_id}, object={object}, decision=ack_active, " f"reason=acknowledgment_is_active, ack_age_seconds={ack_age:.2f}" ) else: log_info( f"activity=ack_check, tenant_id={tenant_id}, object={object}, decision=ack_inactive, " f"reason=no_active_acknowledgment" ) else: # Fallback to REST call if ack collection data not provided try: ack_response = requests.post( f"{server_uri}/services/trackme/v2/ack/get_ack_for_object", headers=header, data=json.dumps( { "tenant_id": tenant_id, "object_category": object_category, "object_list": object, } ), verify=False, timeout=600, ) ack_response.raise_for_status() ack_response_json = ack_response.json() if ack_response_json: ack_response_json = ack_response_json[0] ack_active_string = ack_response_json.get("ack_state", "inactive") ack_mtime = float(ack_response_json.get("ack_mtime", time.time())) if ack_active_string == "active": ack_age = time.time() - ack_mtime ack_active = True log_info( f"activity=ack_check, tenant_id={tenant_id}, object={object}, decision=ack_active, " f"reason=acknowledgment_is_active, ack_age_seconds={ack_age:.2f}" ) else: ack_active = False ack_age = 0 log_info( f"activity=ack_check, tenant_id={tenant_id}, object={object}, decision=ack_inactive, " f"reason=no_active_acknowledgment" ) except Exception as e: log_error( f"activity=ack_check, tenant_id={tenant_id}, object={object}, object_category={object_category}, " f"decision=error, reason=exception_during_ack_retrieval, exception={str(e)}" ) ack_active = False ack_age = 0 # connect to the main tenant KVstore collection collection_main_name = f"kv_trackme_{component}_tenant_{tenant_id}" collection_main = service.kvstore[collection_main_name] # connect to the stateful KVstore collection collection_stateful_alerting_name = ( f"kv_trackme_stateful_alerting_tenant_{tenant_id}" ) collection_stateful_alerting = service.kvstore[ collection_stateful_alerting_name ] # get object_id, if not part of the upstream event, get it from the main KVstore # try both "keyid" and "key" fields as some events use different field names object_id = event.get("keyid", None) if not object_id: object_id = event.get("key", None) if not object_id: # Use cache if available cache_key = (object, object_category) if object_id_cache is not None and cache_key in object_id_cache: object_id = object_id_cache[cache_key] else: object_id = get_keyid_from_main_kvstore(helper, collection_main, object) if object_id_cache is not None: object_id_cache[cache_key] = object_id if not object_id: log_error( f"activity=context_gathering, tenant_id={tenant_id}, object={object}, object_category={object_category}, " f"object_id=None, decision=skip, reason=no_object_id_found_for_object" ) return None # get the object_state from the main KVstore collection (use cache if available) if object_state_cache is not None and object_id in object_state_cache: ( collection_object_state, collection_anomaly_reason, collection_status_message_json, collection_monitored_state, ) = object_state_cache[object_id] else: ( collection_object_state, collection_anomaly_reason, collection_status_message_json, collection_monitored_state, ) = get_object_state_from_main_kvstore(helper, collection_main, object_id) if object_state_cache is not None: object_state_cache[object_id] = ( collection_object_state, collection_anomaly_reason, collection_status_message_json, collection_monitored_state, ) # get object_state, collection_anomaly_reason and status_message_json from the event, if not present fallback to the collection values event_object_state = event.get("object_state", collection_object_state) event_anomaly_reason = event.get("anomaly_reason", collection_anomaly_reason) # normalize the anomaly_reason event_anomaly_reason = normalize_anomaly_reason(event_anomaly_reason) event_status_message_json = event.get( "status_message_json", collection_status_message_json ) event_monitored_state = event.get("monitored_state", collection_monitored_state) # merge object_state = ( event_object_state if event_object_state else collection_object_state ) anomaly_reason = ( event_anomaly_reason if event_anomaly_reason else collection_anomaly_reason ) status_message_json = ( event_status_message_json if event_status_message_json else collection_status_message_json ) # for monitored_state, prefer the collection value instead of the event value monitored_state = ( collection_monitored_state if collection_monitored_state else event_monitored_state ) # cannot process if object_state is None if not object_state: log_info( f"activity=context_gathering, tenant_id={tenant_id}, object={object}, object_id={object_id}, " f"object_state=None, decision=skip, reason=failed_to_retrieve_object_state_from_kvstore" ) return None # get the stateful record, if any (use cache if available) if stateful_record_cache is not None and object_id in stateful_record_cache: stateful_record = stateful_record_cache[object_id] else: stateful_record = get_stateful_records_for_object_id( helper, collection_stateful_alerting, object_id ) if stateful_record_cache is not None: stateful_record_cache[object_id] = stateful_record if stateful_record: stateful_record_state = stateful_record.get("object_state", "unknown") stateful_record_incident_id = stateful_record.get("incident_id", "unknown") stateful_record_alert_status = stateful_record.get("alert_status", "unknown") log_info( f"activity=context_gathering, tenant_id={tenant_id}, object={object}, object_id={object_id}, " f"object_state={object_state}, decision=stateful_record_found, " f"reason=active_stateful_record_exists, stateful_record_state={stateful_record_state}, " f"incident_id={stateful_record_incident_id}, alert_status={stateful_record_alert_status}" ) else: log_info( f"activity=context_gathering, tenant_id={tenant_id}, object={object}, object_id={object_id}, " f"object_state={object_state}, decision=no_stateful_record, reason=no_active_stateful_record_found" ) # Helper function for tenant_in_scope def tenant_in_scope(tenant_id, tenants_scope): try: if isinstance(tenants_scope, list): scope_list = tenants_scope elif isinstance(tenants_scope, str): ts = tenants_scope.strip() if ts == "" or ts == "*": scope_list = ["*"] else: scope_list = [s.strip() for s in ts.split(",") if s.strip()] else: scope_list = ["*"] except Exception: scope_list = ["*"] if "*" in scope_list: return True return tenant_id in scope_list # Check maintenance status maintenance_active = False try: if maintenance_info and maintenance_info.get("maintenance"): if tenant_in_scope(tenant_id, maintenance_info.get("tenants_scope", ["*"])): maintenance_active = True except Exception: maintenance_active = False # Check bank holidays status bank_holidays_active = False try: if bank_holidays_info: payload = bank_holidays_info.get("payload", bank_holidays_info) if payload.get("is_active", False): bank_holidays_active = True except Exception: bank_holidays_active = False # Get event time event_time = float(event.get("_time", time.time())) # Get mtime from stateful_record if it exists mtime = None if stateful_record: mtime_raw = stateful_record.get("mtime") if mtime_raw is not None: try: mtime = float(mtime_raw) except (ValueError, TypeError) as e: log_warning( f"stateful_record exists but mtime cannot be converted to float: tenant_id={tenant_id}, object={object}, object_id={object_id}, mtime_raw={mtime_raw}, error={str(e)}. Skipping mtime check." ) mtime = None return { "tenant_id": tenant_id, "object": object, "object_category": object_category, "object_id": object_id, "object_state": object_state, "monitored_state": monitored_state, "component": component, "ack_active": ack_active, "ack_age": ack_age, "stateful_record": stateful_record, "maintenance_active": maintenance_active, "bank_holidays_active": bank_holidays_active, "event_time": event_time, "mtime": mtime, "priority": priority, "anomaly_reason": anomaly_reason, } def get_stateful_alert_config(service, tenant_id): """ Retrieve the stateful alert configuration for a tenant. Specifically, finds the stateful alert and extracts the orange_as_alerting_state parameter. Args: service: The Splunk service object tenant_id: The tenant identifier Returns: dict: Configuration dict with 'orange_as_alerting_state' (int, 0 or 1) and 'alerting_states' (list) """ try: # Search for stateful alerts for this tenant # Alert name pattern: "TrackMe alert tenant_id:{tenant_id} - *" alert_name_pattern = f"TrackMe alert tenant_id:{tenant_id} -" # Get all saved searches saved_searches = service.saved_searches # Find the stateful alert # Note: Iterating over saved_searches yields SavedSearch objects, not strings # We need to access the .name property to get the string name stateful_alert = None for alert_obj in saved_searches: alert_name = alert_obj.name if alert_name.startswith(alert_name_pattern): # Check if this alert has trackme_stateful_alert action actions = alert_obj.content.get("actions", "") if isinstance(actions, str): actions_list = [a.strip() for a in actions.split(",") if a.strip()] elif isinstance(actions, list): actions_list = actions else: actions_list = [] if "trackme_stateful_alert" in actions_list: stateful_alert = alert_obj log.info( f"activity=config_retrieval, tenant_id={tenant_id}, decision=found, " f"reason=stateful_alert_found, alert_name={alert_name}" ) break if not stateful_alert: log.info( f"activity=config_retrieval, tenant_id={tenant_id}, decision=use_default, " f"reason=no_stateful_alert_found" ) # Default: orange is not considered an alerting state return { "orange_as_alerting_state": 0, "alerting_states": ["red"] } # Extract orange_as_alerting_state parameter orange_as_alerting_state = 0 try: param_value = stateful_alert.content.get("action.trackme_stateful_alert.param.orange_as_alerting_state") if param_value is not None: orange_as_alerting_state = int(param_value) except (ValueError, TypeError) as e: log.warning( f"activity=config_retrieval, tenant_id={tenant_id}, decision=use_default, " f"reason=failed_to_parse_orange_as_alerting_state, exception={str(e)}" ) orange_as_alerting_state = 0 # Determine alerting states based on configuration alerting_states = ["red"] if orange_as_alerting_state: alerting_states.append("orange") log.info( f"activity=config_retrieval, tenant_id={tenant_id}, decision=retrieved, " f"reason=stateful_alert_config_retrieved, orange_as_alerting_state={orange_as_alerting_state}, " f"alerting_states={alerting_states}" ) return { "orange_as_alerting_state": orange_as_alerting_state, "alerting_states": alerting_states } except Exception as e: log.error( f"activity=config_retrieval, tenant_id={tenant_id}, decision=use_default, " f"reason=error_retrieving_stateful_alert_configuration, exception={str(e)}" ) # On error, use default configuration return { "orange_as_alerting_state": 0, "alerting_states": ["red"] } def should_prefilter_yield_event(event, context, alerting_states, sourcetype): """ Pre-filtering function to determine if an event meets the basic conditions for being investigated by the stateful backend. Conditions: 1. No stateful record AND in alerting state (taking into account orange_as_alerting_state config) 2. Has stateful record BUT state changed (closure, update, or reopening) 3. SLA breach events (sourcetype=trackme:sla_breaches) - yield as updates if stateful record exists Args: event: The event dictionary context: The context dictionary from get_event_filtering_context() alerting_states: List of alerting states (e.g., ["red", "orange"]) sourcetype: The sourcetype of the event Returns: tuple: (should_yield: bool, reason: str or None) """ if context is None: return (False, "Failed to gather event context") stateful_record = context.get("stateful_record") object_state = context.get("object_state") # Condition 3: SLA breach events - yield if there's a stateful record (update scenario) # or if in alerting state (new incident scenario) if sourcetype == "trackme:sla_breaches": if stateful_record: return (True, "SLA breach event with existing stateful record - yield as update") elif object_state in alerting_states: return (True, f"SLA breach event in alerting state - object_state={object_state} is in alerting_states={alerting_states}") else: return (False, f"SLA breach event but object_state={object_state} is not in alerting_states={alerting_states}") # Condition 1: No stateful record AND in alerting state if not stateful_record: if object_state in alerting_states: return (True, f"No stateful record and object_state={object_state} is in alerting_states={alerting_states}") else: return (False, f"No stateful record but object_state={object_state} is not in alerting_states={alerting_states}") # Condition 2: Has stateful record BUT state changed (closure, update, or reopening) # Always yield when state changes, regardless of sourcetype stateful_record_object_state = stateful_record.get("object_state") if stateful_record_object_state != object_state: return (True, f"State changed: previous_state={stateful_record_object_state}, new_state={object_state}") # Condition 3: Has stateful record, state unchanged, but sourcetype indicates discrete event # For discrete events (SLA breaches, notables, flips), yield updates even if state unchanged # For continuous events (trackme:state), skip to avoid repeated updates # Check for flip events (state transitions) - discrete events if sourcetype == "trackme:flip": return (True, f"Flip event with existing stateful record - yield as update even if state unchanged") # Check for notable events (sourcetype might be from trackme_notable_idx) - discrete events if sourcetype and "notable" in sourcetype.lower(): return (True, f"Notable event with existing stateful record - yield as update even if state unchanged") # For trackme:state (continuous events), don't yield if state unchanged # This prevents repeated updates for entities that remain in the same alerting state # Continuous events should only create updates when state actually changes if sourcetype == "trackme:state": mtime = context.get("mtime") event_time = context.get("event_time") if mtime is not None and event_time is not None: return (False, f"Has stateful record, state unchanged, trackme:state continuous event - skipping to avoid duplicate updates: object_state={object_state}, event_time={event_time}, mtime={mtime}") return (False, f"Has stateful record and state unchanged: object_state={object_state}, sourcetype={sourcetype}") # For other sourcetypes (including SLA breaches handled earlier, but as fallback), yield if state unchanged # Discrete events should create updates even when state hasn't changed return (True, f"Has stateful record, state unchanged, but discrete event type - yield as update: object_state={object_state}, sourcetype={sourcetype}") def filter_stateful_alert_event( helper, server_uri, header, event, context, alerting_states, validation_cache=None, sourcetype=None, ): """ Function to filter a stateful alert event based on all filtering criteria. Returns True if the event should be processed, False if it should be skipped. Args: helper: The helper object (can be None for custom command usage) server_uri: The server URI header: The authorization header event: The event dictionary context: The context dictionary from get_event_filtering_context() alerting_states: List of alerting states (e.g., ["red", "orange"]) Returns: tuple: (should_process: bool, skip_reason: str or None) """ if context is None: return (False, "Failed to gather event context") # Helper function for logging def log_info(msg): if helper: helper.log_info(msg) else: log.info(msg) tenant_id = context["tenant_id"] object = context["object"] object_id = context["object_id"] object_state = context["object_state"] monitored_state = context["monitored_state"] component = context["component"] ack_active = context["ack_active"] ack_age = context["ack_age"] stateful_record = context["stateful_record"] maintenance_active = context["maintenance_active"] bank_holidays_active = context.get("bank_holidays_active", False) event_time = context["event_time"] mtime = context["mtime"] # if the monitored_state is "disabled", we need to skip the processing if monitored_state == "disabled": return ( False, f'monitored_state is disabled: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}', ) # if the stateful record is not found, we need to check if we should create a new thread if not stateful_record: if maintenance_active and object_state in alerting_states: return ( False, f"maintenance active, skipping new incident creation: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}", ) if bank_holidays_active and object_state in alerting_states: return ( False, f"bank holidays active, skipping new incident creation: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}", ) # Check if ack is active and older than 5 minutes - if yes, skip creating new incident if ack_active and ack_age > 300: # 300 seconds = 5 minutes return ( False, f'Ack is active and older than 5 minutes: tenant_id={tenant_id}, object={object}, object_id={object_id}, ack_active=True, ack_age={ack_age:.2f} seconds', ) # Safety check: Validate object_state via REST call before accepting entity for alert # This ensures the entity is actually in an alerting state according to the decision maker if object_state in alerting_states: # Use cache if available (with short TTL since state can change) cache_key = (object_id, tuple(sorted(alerting_states))) cache_ttl = 5 # Cache validation results for 5 seconds current_time = time.time() if validation_cache is not None and cache_key in validation_cache: cached_result, cached_time = validation_cache[cache_key] if current_time - cached_time < cache_ttl: is_valid, actual_object_state, error_message = cached_result else: # Cache expired, refresh is_valid, actual_object_state, error_message = validate_object_state_via_rest( helper=helper, server_uri=server_uri, header=header, tenant_id=tenant_id, component=component, object_id=object_id, alerting_states=alerting_states, ) validation_cache[cache_key] = ((is_valid, actual_object_state, error_message), current_time) else: is_valid, actual_object_state, error_message = validate_object_state_via_rest( helper=helper, server_uri=server_uri, header=header, tenant_id=tenant_id, component=component, object_id=object_id, alerting_states=alerting_states, ) if validation_cache is not None: validation_cache[cache_key] = ((is_valid, actual_object_state, error_message), current_time) if not is_valid: # Skip this event - the actual object_state from the decision maker is not in alerting_states return ( False, f"Object state validation failed: actual_object_state={actual_object_state}, expected_states={alerting_states}: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}", ) # do not process if no stateful_record and object_state is green/blue if object_state not in alerting_states: return ( False, f'no stateful record and non-alerting state: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}, event_id={event.get("event_id")}', ) # check if the event should be skipped (for existing stateful records) if stateful_record: # Check if the current event's message_source_id (event_id) matches the one used for opened status # If they match, we should not allow an updated status event_id = event.get("event_id") stateful_record_message_source_id = stateful_record.get("message_source_id") if stateful_record_message_source_id and event_id and stateful_record_message_source_id == event_id: # The event_id (message_source_id) is the same as the one used for opened status # Skip processing this event to prevent duplicate updates return ( False, f"event_id matches the one used for opened status, preventing duplicate update: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}, event_id={event_id}", ) # Additional check: If the action would be updated BUT the sourcetype is trackme:state or trackme:flip # AND anomaly_reason has not changed, we should skip the updated event if sourcetype in ("trackme:state", "trackme:flip") and object_state in alerting_states: # Get current event's anomaly_reason from context (already normalized) current_anomaly_reason = context.get("anomaly_reason", []) if not isinstance(current_anomaly_reason, list): current_anomaly_reason = normalize_anomaly_reason(current_anomaly_reason) # Get stored anomaly_reason from stateful record (merge opened and updated) stored_opened_anomaly_reason = normalize_anomaly_reason(stateful_record.get("opened_anomaly_reason", [])) stored_updated_anomaly_reason = normalize_anomaly_reason(stateful_record.get("updated_anomaly_reason", [])) # Merge stored anomaly reasons (same logic as in helper) stored_anomaly_reason = list(set(stored_opened_anomaly_reason + stored_updated_anomaly_reason)) # Compare anomaly_reason lists (order doesn't matter, so compare as sets) if isinstance(current_anomaly_reason, list) and isinstance(stored_anomaly_reason, list): if set(current_anomaly_reason) == set(stored_anomaly_reason): # Anomaly reason hasn't changed, skip the update return ( False, f"anomaly_reason unchanged for trackme:state/trackme:flip event, skipping update: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}, sourcetype={sourcetype}, anomaly_reason={current_anomaly_reason}", ) # Get the object_state from the stateful record for comparison stateful_record_object_state = stateful_record.get("object_state") # Check if the state has changed - if it has, we should process even with active Ack state_changed = stateful_record_object_state != object_state # if we have a stateful record with mtime, and the event time is not newer than the mtime, we can skip the processing # UNLESS the state has changed (in which case we need to process to update/close the incident) if mtime is not None and event_time <= mtime and not state_changed: return ( False, f'event is not newer than stateful record last update: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}, event_time={time.strftime("%c", time.localtime(event_time))}, mtime={time.strftime("%c", time.localtime(mtime))}', ) # Early exit checks: If state hasn't changed and we're going to skip due to maintenance/bank holidays, # skip without validation to avoid unnecessary REST calls if not state_changed: if maintenance_active and object_state in alerting_states: return ( False, f"maintenance active, skipping incident update: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}", ) if bank_holidays_active and object_state in alerting_states: return ( False, f"bank holidays active, skipping incident update: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}", ) # Safety check: Validate object_state via REST call before accepting entity for alert update # This ensures the entity is actually in an alerting state according to the decision maker # Only validate if: # 1. Object is in alerting state AND # 2. We're actually going to process it (state changed OR not in maintenance/bank holidays) if object_state in alerting_states: # Use cache if available (with short TTL since state can change) cache_key = (object_id, tuple(sorted(alerting_states))) cache_ttl = 5 # Cache validation results for 5 seconds current_time = time.time() if validation_cache is not None and cache_key in validation_cache: cached_result, cached_time = validation_cache[cache_key] if current_time - cached_time < cache_ttl: is_valid, actual_object_state, error_message = cached_result else: # Cache expired, refresh is_valid, actual_object_state, error_message = validate_object_state_via_rest( helper=helper, server_uri=server_uri, header=header, tenant_id=tenant_id, component=component, object_id=object_id, alerting_states=alerting_states, ) validation_cache[cache_key] = ((is_valid, actual_object_state, error_message), current_time) else: is_valid, actual_object_state, error_message = validate_object_state_via_rest( helper=helper, server_uri=server_uri, header=header, tenant_id=tenant_id, component=component, object_id=object_id, alerting_states=alerting_states, ) if validation_cache is not None: validation_cache[cache_key] = ((is_valid, actual_object_state, error_message), current_time) if not is_valid: # Skip this event - the actual object_state from the decision maker is not in alerting_states return ( False, f"Object state validation failed: actual_object_state={actual_object_state}, expected_states={alerting_states}: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}", ) # IMPORTANT: If state has changed, process the event even if Ack is active # This ensures incidents are properly opened/updated/closed when state changes occur if state_changed: log_info( f"activity=filtering, tenant_id={tenant_id}, object={object}, object_id={object_id}, " f"object_state={object_state}, decision=process, reason=state_changed, " f"previous_state={stateful_record_object_state}, new_state={object_state}, ack_active={ack_active}" ) # Continue processing - don't skip due to Ack # Event passed all filtering criteria return (True, None) @Configuration(distributed=False) class TrackMeStateful(GeneratingCommand): tenant_id = Option( doc=""" **Syntax:** **tenant_id=**** **Description:** The tenant identifier.""", require=True, default=None, ) def generate(self, **kwargs): # Start performance counter start = time.time() # Get request info and set logging level reqinfo = trackme_reqinfo( self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri ) log.setLevel(reqinfo["logging_level"]) # Get earliest and latest times earliest = self._metadata.searchinfo.earliest_time latest = self._metadata.searchinfo.latest_time # Get service server_uri = self._metadata.searchinfo.splunkd_uri session_key = self._metadata.searchinfo.session_key service = client.connect( owner="nobody", app="trackme", port=reqinfo["server_rest_port"], token=session_key, timeout=600, ) # Build header for REST calls header = { "Authorization": f"Splunk {session_key}", "Content-Type": "application/json", } # Retrieve stateful alert configuration to determine alerting states # This includes the "orange_as_alerting_state" setting alert_config = get_stateful_alert_config(service, self.tenant_id) alerting_states = alert_config["alerting_states"] orange_as_alerting_state = alert_config["orange_as_alerting_state"] log.info( f"activity=initialization, tenant_id={self.tenant_id}, decision=configured, " f"reason=alerting_states_configured, alerting_states={alerting_states}, " f"orange_as_alerting_state={orange_as_alerting_state}" ) # Load ack collection once for efficient lookup (instead of REST calls per event) # Use composite keys (object::object_category) to avoid collisions when same object exists in multiple categories ack_collection_keys = None ack_collection_dict = None try: ack_collection_name = f"kv_trackme_common_alerts_ack_tenant_{self.tenant_id}" ( ack_records, _, _, last_page, ) = search_kv_collection_sdkmode( log, service, ack_collection_name, page=1, page_count=0, orderby="keyid" ) # Build composite key dictionary to avoid collisions ack_collection_dict = {} ack_collection_keys = set() for record in ack_records: obj = record.get("object") obj_cat = record.get("object_category") if obj and obj_cat: composite_key = f"{obj}::{obj_cat}" ack_collection_dict[composite_key] = record ack_collection_keys.add(composite_key) log.info( f"activity=initialization, tenant_id={self.tenant_id}, decision=loaded, " f"reason=ack_collection_loaded_for_efficient_lookup, ack_records_count={len(ack_records)}, " f"unique_composite_keys={len(ack_collection_keys)}" ) except Exception as e: log.warning( f"activity=initialization, tenant_id={self.tenant_id}, decision=fallback, " f"reason=ack_collection_load_failed_will_use_rest_calls, exception={str(e)}" ) # Will fall back to REST calls per event if collection load fails ack_collection_keys = None ack_collection_dict = None # Get maintenance status try: endpoint = f"{server_uri}/services/trackme/v2/maintenance/check_global_maintenance_status" resp = requests.get(endpoint, headers=header, verify=False, timeout=60) resp.raise_for_status() maintenance_info = resp.json() # Normalize tenants_scope if isinstance(maintenance_info.get("tenants_scope"), str): ts = maintenance_info.get("tenants_scope", "*").strip() if ts == "" or ts == "*": maintenance_info["tenants_scope"] = ["*"] else: maintenance_info["tenants_scope"] = [ s.strip() for s in ts.split(",") if s.strip() ] elif not isinstance(maintenance_info.get("tenants_scope"), list): maintenance_info["tenants_scope"] = ["*"] except Exception as e: log.error( f"activity=maintenance_check, tenant_id={self.tenant_id}, decision=error, " f"reason=maintenance_check_failed, exception={str(e)}" ) maintenance_info = None # Get bank holidays status bank_holidays_info = None try: endpoint = f"{server_uri}/services/trackme/v2/bank_holidays/check_active" resp = requests.get(endpoint, headers=header, verify=False, timeout=60) resp.raise_for_status() bank_holidays_info = resp.json() log.info( f"activity=bank_holidays_check, tenant_id={self.tenant_id}, decision=retrieved, " f"reason=bank_holidays_status_retrieved, is_active={bank_holidays_info.get('payload', {}).get('is_active', False)}" ) except Exception as e: log.error( f"activity=bank_holidays_check, tenant_id={self.tenant_id}, decision=error, " f"reason=bank_holidays_check_failed, exception={str(e)}" ) bank_holidays_info = None # Build the simplified stateful alert search query search_query = remove_leading_spaces(f"""\ search (`trackme_idx({self.tenant_id})` (sourcetype=trackme:state) tenant_id="{self.tenant_id}" object_category="*" monitored_state="enabled") OR (`trackme_idx({self.tenant_id})` (sourcetype=trackme:flip) tenant_id="{self.tenant_id}" object_category="*" object_previous_state!="discovered") OR (`trackme_idx({self.tenant_id})` (sourcetype=trackme:sla_breaches) tenant_id="{self.tenant_id}" object_category="*") OR (`trackme_notable_idx({self.tenant_id})` tenant_id="{self.tenant_id}" object_category=*) """) # Search parameters search_kwargs = { "earliest_time": earliest, "latest_time": latest, "count": 0, "output_mode": "json", } log.info( f"activity=initialization, tenant_id={self.tenant_id}, decision=start, reason=trackmestateful_command_started" ) # Create a minimal helper-like object for filtering functions class MinimalHelper: def log_info(self, msg): log.info(msg) def log_debug(self, msg): log.debug(msg) def log_error(self, msg): log.error(msg) def log_warning(self, msg): log.warning(msg) minimal_helper = MinimalHelper() # Initialize caches for performance optimization # These caches avoid redundant KVstore queries and REST calls for the same object across multiple events object_id_cache = {} # (object, object_category) -> object_id object_state_cache = {} # object_id -> (object_state, anomaly_reason, status_message_json, monitored_state) stateful_record_cache = {} # object_id -> stateful_record validation_cache = {} # (object_id, alerting_states_tuple) -> ((is_valid, actual_object_state, error_message), timestamp) score_cache = {} # object_id -> (score, score_definition) log.info( f"activity=initialization, tenant_id={self.tenant_id}, decision=caches_initialized, " f"reason=performance_optimization_caches_created" ) # Execute the search try: reader = run_splunk_search( service, search_query, search_kwargs, 24, 5 ) events_processed = 0 events_yielded = 0 events_filtered = 0 events_passed_filters = 0 # Track events that passed all filters (before deduplication) # Deduplication: Track latest event per (object_id, sourcetype) combination # This prevents race conditions where multiple events for the same object # in a single execution cause multiple updates (opened -> updated) events_by_key = {} # key: (object_id, sourcetype) -> (yield_event, _time, context) for item in reader: if isinstance(item, dict): events_processed += 1 # Get filtering context for this event try: event = item.get("_raw") if not isinstance(event, dict): if event is None: raise ValueError("Event _raw field is None") event = json.loads(event) # get original Splunk metadata: index, sourcetype, host, source index = item.get("index") sourcetype = item.get("sourcetype") host = item.get("host") source = item.get("source") yield_event = { "_time": event.get("_time", time.time()), "index": index, "sourcetype": sourcetype, "host": host, "source": source, "_raw": event, } for key, value in event.items(): yield_event[key] = value context = get_event_filtering_context( helper=minimal_helper, service=service, server_uri=server_uri, header=header, event=event, maintenance_info=maintenance_info, alerting_states=alerting_states, ack_collection_keys=ack_collection_keys, ack_collection_dict=ack_collection_dict, bank_holidays_info=bank_holidays_info, object_id_cache=object_id_cache, object_state_cache=object_state_cache, stateful_record_cache=stateful_record_cache, ) # If context gathering failed, skip this event if context is None: events_filtered += 1 log.info( f"activity=context_gathering, tenant_id={event.get('tenant_id')}, object={event.get('object')}, " f"decision=skip, reason=context_gathering_failed" ) continue # Pre-filter: Check if event meets basic conditions for stateful backend should_prefilter_yield, prefilter_reason = should_prefilter_yield_event( event=event, context=context, alerting_states=alerting_states, sourcetype=sourcetype, ) if not should_prefilter_yield: events_filtered += 1 log.info( f"activity=pre_filter, tenant_id={context.get('tenant_id')}, object={context.get('object')}, " f"object_id={context.get('object_id')}, object_state={context.get('object_state')}, " f"decision=skip, reason={prefilter_reason if prefilter_reason else 'unknown_reason'}" ) continue # Event passed pre-filter, now apply full filtering logic # (ack checks, maintenance checks, validation, etc.) should_process, skip_reason = filter_stateful_alert_event( helper=minimal_helper, server_uri=server_uri, header=header, event=event, context=context, alerting_states=alerting_states, validation_cache=validation_cache, sourcetype=sourcetype, ) if should_process: # Event passed all filters - add to deduplication dict events_passed_filters += 1 # Use (object_id, sourcetype) as key to deduplicate within this execution object_id = context.get('object_id') dedup_key = (object_id, sourcetype) # Convert _time to float for consistent comparison (handles string epoch timestamps) event_time = float(yield_event.get("_time", time.time())) # Keep only the latest event per (object_id, sourcetype) combination if dedup_key not in events_by_key: events_by_key[dedup_key] = (yield_event, event_time, context, prefilter_reason) log.debug( f"activity=deduplication, tenant_id={context.get('tenant_id')}, object={context.get('object')}, " f"object_id={object_id}, sourcetype={sourcetype}, decision=added, " f"reason=first_event_for_object_sourcetype_combination, event_time={event_time}" ) else: existing_event, existing_time, existing_context, existing_reason = events_by_key[dedup_key] # Ensure existing_time is also float for safe comparison if not isinstance(existing_time, (int, float)): existing_time = float(existing_time) if event_time > existing_time: # This event is newer, replace the existing one events_by_key[dedup_key] = (yield_event, event_time, context, prefilter_reason) log.debug( f"activity=deduplication, tenant_id={context.get('tenant_id')}, object={context.get('object')}, " f"object_id={object_id}, sourcetype={sourcetype}, decision=replaced, " f"reason=newer_event_found, old_time={existing_time}, new_time={event_time}" ) else: log.debug( f"activity=deduplication, tenant_id={context.get('tenant_id')}, object={context.get('object')}, " f"object_id={object_id}, sourcetype={sourcetype}, decision=skipped, " f"reason=older_event_ignored, existing_time={existing_time}, event_time={event_time}" ) log.info( f"activity=filtering, tenant_id={context.get('tenant_id')}, object={context.get('object')}, " f"object_id={context.get('object_id')}, object_state={context.get('object_state')}, " f"decision=yield, reason=passed_pre_filter_and_full_filter, prefilter_reason={prefilter_reason}" ) else: events_filtered += 1 log.info( f"activity=filtering, tenant_id={context.get('tenant_id')}, object={context.get('object')}, " f"object_id={context.get('object_id')}, object_state={context.get('object_state')}, " f"decision=skip, reason={skip_reason if skip_reason else 'unknown_reason'}" ) except Exception as e: events_filtered += 1 # Safely extract event info for logging (event might not be a dict if JSON parsing failed) tenant_id_info = "unknown" object_info = "unknown" try: if isinstance(event, dict): tenant_id_info = event.get('tenant_id', 'unknown') object_info = event.get('object', 'unknown') except Exception: pass log.error( f"activity=event_processing, tenant_id={tenant_id_info}, object={object_info}, " f"decision=error, reason=exception_during_event_processing, exception={str(e)}" ) # On error, we skip the event (fail-safe behavior) continue # Deduplication: Extract only the latest event per (object_id, sourcetype) combination # This prevents race conditions where multiple events for the same object in a single execution # would cause multiple updates (opened -> updated) in the backend events_yielded_records = [] events_yielded = len(events_by_key) # Count deduplicated events for dedup_key, (yield_event, event_time, context, prefilter_reason) in events_by_key.items(): object_id, sourcetype = dedup_key # Load score and score_definition from component data via REST call tenant_id = context.get('tenant_id') component = context.get('component') # Check cache first to avoid redundant REST calls score = None score_definition = None if object_id in score_cache: score, score_definition = score_cache[object_id] log.debug( f"activity=load_score, tenant_id={tenant_id}, object_id={object_id}, " f"decision=cache_hit, score={score}" ) elif tenant_id and component and object_id: try: # Build the REST endpoint URL url = f"{server_uri}/services/trackme/v2/component/load_component_data" # Prepare query parameters params = { "tenant_id": tenant_id, "component": component, "pagination_mode": "local", "filter_key": object_id, } log.debug( f"activity=load_score, tenant_id={tenant_id}, component={component}, object_id={object_id}, " f"url={url}, params={params}" ) # Make the REST call response = requests.get( url, headers=header, verify=False, params=params, timeout=30, ) # Check response status if response.status_code == 200: try: response_json = response.json() # Handle different response structures: # - If pagination_mode="local": response is {"payload": [records...], "status": 200} # Splunk REST framework may unwrap to just [records...] # - If pagination_mode="remote": response is {"payload": {"data": [records...], "last_page": ...}, "status": 200} # Check if response is wrapped in "payload" if isinstance(response_json, dict) and "payload" in response_json: payload = response_json["payload"] # Check if payload has "data" key (remote pagination) or is a list (local pagination) if isinstance(payload, dict) and "data" in payload: data = payload["data"] elif isinstance(payload, list): data = payload else: data = [] # Check if response is a list directly (local pagination, unwrapped) elif isinstance(response_json, list): data = response_json # Check if response has "data" key directly elif isinstance(response_json, dict) and "data" in response_json: data = response_json["data"] else: data = [] if data and len(data) > 0: # Get the first record (should be only one when filtering by object_id) record = data[0] # Extract score and score_definition score_raw = record.get("score") score_definition = record.get("score_definition") # Convert score to integer (score is always an integer) score = None if score_raw is not None: try: score = int(float(score_raw)) # Convert to float first to handle string "100.0", then to int except (ValueError, TypeError): log.warning( f"activity=load_score, tenant_id={tenant_id}, object_id={object_id}, " f"decision=error, reason=failed_to_convert_score_to_int, score_raw={score_raw}" ) score = None # Cache the results score_cache[object_id] = (score, score_definition) log.debug( f"activity=load_score, tenant_id={tenant_id}, object_id={object_id}, " f"score={score}, decision=cached" ) else: # Cache None values when data is empty to avoid redundant REST calls # This handles the case where REST call succeeds but no records found for object_id score_cache[object_id] = (None, None) log.debug( f"activity=load_score, tenant_id={tenant_id}, object_id={object_id}, " f"decision=cached_empty, reason=no_data_returned_from_rest_call" ) except (json.JSONDecodeError, KeyError, IndexError, TypeError) as e: log.warning( f"activity=load_score, tenant_id={tenant_id}, component={component}, object_id={object_id}, " f"decision=error, reason=failed_to_parse_response, exception={str(e)}" ) # Cache None values to avoid repeated failed calls score_cache[object_id] = (None, None) else: log.warning( f"activity=load_score, tenant_id={tenant_id}, component={component}, object_id={object_id}, " f"decision=error, reason=rest_call_failed, status_code={response.status_code}, " f"response_text={response.text[:200]}" ) # Cache None values to avoid repeated failed calls score_cache[object_id] = (None, None) except Exception as e: log.warning( f"activity=load_score, tenant_id={tenant_id}, component={component}, object_id={object_id}, " f"decision=error, reason=exception_during_rest_call, exception={str(e)}" ) # Cache None values to avoid repeated failed calls score_cache[object_id] = (None, None) # Add to yield_event if present if score is not None: yield_event["score"] = score if score_definition is not None: yield_event["score_definition"] = score_definition events_yielded_records.append(yield_event) log.debug( f"activity=deduplication, tenant_id={context.get('tenant_id')}, object={context.get('object')}, " f"object_id={object_id}, sourcetype={sourcetype}, decision=final_yield, " f"reason=latest_event_selected_for_object_sourcetype_combination, event_time={event_time}" ) # Log deduplication summary if duplicates were found if len(events_by_key) < events_passed_filters: duplicates_removed = events_passed_filters - len(events_by_key) log.info( f"activity=deduplication, tenant_id={self.tenant_id}, decision=completed, " f"reason=duplicate_events_deduplicated, events_passed_filters={events_passed_filters}, " f"events_after_dedup={len(events_by_key)}, duplicates_removed={duplicates_removed}" ) # yield events (now deduplicated) for yield_record in generate_fields(events_yielded_records): yield yield_record log.info( f"activity=completion, tenant_id={self.tenant_id}, decision=terminated, " f"reason=trackmestateful_command_completed, run_time_seconds={round(time.time() - start, 3)}, " f"events_processed={events_processed}, events_yielded={events_yielded}, events_filtered={events_filtered}, " f"cache_stats=object_id_cache_size={len(object_id_cache)}, object_state_cache_size={len(object_state_cache)}, " f"stateful_record_cache_size={len(stateful_record_cache)}, validation_cache_size={len(validation_cache)}, " f"score_cache_size={len(score_cache)}" ) except Exception as e: log.error( f"activity=execution, tenant_id={self.tenant_id}, decision=error, " f"reason=trackmestateful_command_failed, exception={str(e)}" ) # Raise exception for main search failures - don't yield error events # Error events with missing required fields (object, object_category) would crash the downstream backend raise Exception( f"trackmestateful command failed for tenant_id={self.tenant_id}: {str(e)}" ) from e dispatch(TrackMeStateful, sys.argv, sys.stdin, sys.stdout, __name__)