# Splunk_Deploiement/apps/trackme/bin/trackmestateful.py
#!/usr/bin/env python
# coding=utf-8
__author__ = "TrackMe Limited"
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
__credits__ = "TrackMe Limited, U.K."
__license__ = "TrackMe Limited, all rights reserved"
__version__ = "0.1.0"
__maintainer__ = "TrackMe Limited, U.K."
__email__ = "support@trackme-solutions.com"
__status__ = "PRODUCTION"
# Standard library imports
import json
import logging
import os
import sys
import time
# Third-party library imports
import requests
import urllib3
from logging.handlers import RotatingFileHandler
# Disable insecure request warnings for urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# set splunkhome from the environment (always set when running under Splunk)
splunkhome = os.environ["SPLUNK_HOME"]

# set logging: dedicated rotating log file, ~10 MB per file, one backup kept
filehandler = RotatingFileHandler(
    "%s/var/log/splunk/trackme_stateful.log" % splunkhome,
    mode="a",
    maxBytes=10000000,
    backupCount=1,
)
formatter = logging.Formatter(
    "%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s"
)
# emit log timestamps in UTC
logging.Formatter.converter = time.gmtime
filehandler.setFormatter(formatter)
log = logging.getLogger()  # root logger - Good to get it only once.
for hdlr in log.handlers[:]:  # remove the existing file handlers
    if isinstance(hdlr, logging.FileHandler):
        log.removeHandler(hdlr)
log.addHandler(filehandler)  # set the new handler
# set the log level to INFO, DEBUG as the default is ERROR
log.setLevel(logging.INFO)

# append current directory so the local libs below (import_declare_test, trackme_libs...) resolve
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# import libs
import import_declare_test
# Import Splunk libs
from splunklib.searchcommands import (
dispatch,
GeneratingCommand,
Configuration,
Option,
validators,
)
# Import trackme libs
from trackme_libs import trackme_reqinfo, run_splunk_search
from trackme_libs_get_data import search_kv_collection_sdkmode
import splunklib.client as client
# Import trackme libs utils
from trackme_libs_utils import decode_unicode, normalize_anomaly_reason, remove_leading_spaces
# Import helper functions from stateful alert helper (these are utility functions, not filtering logic)
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "trackme"))
from modalert_trackme_stateful_alert_helper import (
get_keyid_from_main_kvstore,
get_object_state_from_main_kvstore,
get_stateful_records_for_object_id,
validate_object_state_via_rest,
)
def generate_fields(records):
    """
    Ensure every record exposes the same set of fields so that Splunk can
    automatically extract them.

    Any field missing from a given record is added with an empty string
    value. Records are mutated in place and yielded one by one.
    """
    # first pass: collect the union of field names across all records
    field_names = set()
    for rec in records:
        field_names.update(rec)
    # second pass: backfill missing fields, then yield the record
    for rec in records:
        for missing in field_names - set(rec):
            rec[missing] = ""
        yield rec
def get_event_filtering_context(
    helper,
    service,
    server_uri,
    header,
    event,
    maintenance_info,
    alerting_states,
    ack_collection_keys=None,
    ack_collection_dict=None,
    bank_holidays_info=None,
    object_id_cache=None,
    object_state_cache=None,
    stateful_record_cache=None,
):
    """
    Function to gather all context needed for filtering a stateful alert event.
    This includes object_id, object_state, stateful_record, ack status,
    maintenance status and bank holidays status.
    Args:
        helper: The helper object (can be None for custom command usage)
        service: The Splunk service object
        server_uri: The server URI
        header: The authorization header
        event: The event dictionary
        maintenance_info: The maintenance info dictionary
        alerting_states: List of alerting states (e.g., ["red", "orange"])
        ack_collection_keys: Set of composite keys (object::object_category) that have acks (for efficient lookup)
        ack_collection_dict: Dictionary mapping composite key -> ack_record (for efficient lookup)
        bank_holidays_info: Optional bank holidays info dictionary
        object_id_cache: Optional dict caching (object, object_category) -> object_id
        object_state_cache: Optional dict caching object_id -> state tuple
        stateful_record_cache: Optional dict caching object_id -> stateful record
    Returns:
        dict: Context dictionary with all filtering-related information,
        or None when the object_id or object_state cannot be resolved.
    """

    # Logging helpers: route to the alert-action helper when provided,
    # otherwise fall back to the module-level logger
    def log_info(msg):
        if helper:
            helper.log_info(msg)
        else:
            log.info(msg)

    def log_debug(msg):
        if helper:
            helper.log_debug(msg)
        else:
            log.debug(msg)

    def log_error(msg):
        if helper:
            helper.log_error(msg)
        else:
            log.error(msg)

    def log_warning(msg):
        if helper:
            helper.log_warning(msg)
        else:
            log.warning(msg)

    # get tenant_id
    tenant_id = event["tenant_id"]

    # get alias, object, object_category, priority
    # NOTE(review): alias is retrieved but not used nor returned in this function
    alias = event.get("alias", None)

    # get object, we may have to deal with problematic non ASCII chars
    # NOTE(review): "object" shadows the Python builtin; kept as-is for
    # consistency with the rest of the file
    object = decode_unicode(event["object"])
    object_category = event["object_category"]
    priority = event.get("priority", "medium")

    # get component from object_category (splk-<component>)
    component = object_category.split("-")[1]

    # check if the ack is active for the object using KVstore lookup
    ack_active = False
    ack_age = 0

    # Use KVstore lookup if ack collection data is provided, otherwise fall back to REST call
    if ack_collection_keys is not None and ack_collection_dict is not None:
        # Efficient KVstore lookup using composite key (object::object_category)
        composite_key = f"{object}::{object_category}"
        if composite_key in ack_collection_keys:
            ack_record = ack_collection_dict.get(composite_key)
            # Record should match since we use composite key
            if ack_record:
                ack_active_string = ack_record.get("ack_state", "inactive")
                ack_mtime_raw = ack_record.get("ack_mtime", None)
                if ack_mtime_raw:
                    try:
                        ack_mtime = float(ack_mtime_raw)
                    except (ValueError, TypeError):
                        # malformed mtime: fall back to "now" so ack_age computes to ~0
                        ack_mtime = time.time()
                else:
                    ack_mtime = time.time()
                if ack_active_string == "active":
                    ack_age = time.time() - ack_mtime
                    ack_active = True
                else:
                    ack_active = False
                    ack_age = 0
            else:
                ack_active = False
                ack_age = 0
        else:
            ack_active = False
            ack_age = 0

        # Log the result (object_id will be available after we get it from KVstore)
        if ack_active:
            log_info(
                f"activity=ack_check, tenant_id={tenant_id}, object={object}, decision=ack_active, "
                f"reason=acknowledgment_is_active, ack_age_seconds={ack_age:.2f}"
            )
        else:
            log_info(
                f"activity=ack_check, tenant_id={tenant_id}, object={object}, decision=ack_inactive, "
                f"reason=no_active_acknowledgment"
            )
    else:
        # Fallback to REST call if ack collection data not provided
        try:
            ack_response = requests.post(
                f"{server_uri}/services/trackme/v2/ack/get_ack_for_object",
                headers=header,
                data=json.dumps(
                    {
                        "tenant_id": tenant_id,
                        "object_category": object_category,
                        "object_list": object,
                    }
                ),
                verify=False,
                timeout=600,
            )
            ack_response.raise_for_status()
            ack_response_json = ack_response.json()
            # endpoint returns a list; an empty response leaves ack inactive
            if ack_response_json:
                ack_response_json = ack_response_json[0]
                ack_active_string = ack_response_json.get("ack_state", "inactive")
                ack_mtime = float(ack_response_json.get("ack_mtime", time.time()))
                if ack_active_string == "active":
                    ack_age = time.time() - ack_mtime
                    ack_active = True
                    log_info(
                        f"activity=ack_check, tenant_id={tenant_id}, object={object}, decision=ack_active, "
                        f"reason=acknowledgment_is_active, ack_age_seconds={ack_age:.2f}"
                    )
                else:
                    ack_active = False
                    ack_age = 0
                    log_info(
                        f"activity=ack_check, tenant_id={tenant_id}, object={object}, decision=ack_inactive, "
                        f"reason=no_active_acknowledgment"
                    )
        except Exception as e:
            # best-effort: on any failure, treat the object as not acknowledged
            log_error(
                f"activity=ack_check, tenant_id={tenant_id}, object={object}, object_category={object_category}, "
                f"decision=error, reason=exception_during_ack_retrieval, exception={str(e)}"
            )
            ack_active = False
            ack_age = 0

    # connect to the main tenant KVstore collection
    collection_main_name = f"kv_trackme_{component}_tenant_{tenant_id}"
    collection_main = service.kvstore[collection_main_name]

    # connect to the stateful KVstore collection
    collection_stateful_alerting_name = (
        f"kv_trackme_stateful_alerting_tenant_{tenant_id}"
    )
    collection_stateful_alerting = service.kvstore[
        collection_stateful_alerting_name
    ]

    # get object_id, if not part of the upstream event, get it from the main KVstore
    # try both "keyid" and "key" fields as some events use different field names
    object_id = event.get("keyid", None)
    if not object_id:
        object_id = event.get("key", None)
    if not object_id:
        # Use cache if available
        cache_key = (object, object_category)
        if object_id_cache is not None and cache_key in object_id_cache:
            object_id = object_id_cache[cache_key]
        else:
            object_id = get_keyid_from_main_kvstore(helper, collection_main, object)
            if object_id_cache is not None:
                object_id_cache[cache_key] = object_id
    if not object_id:
        log_error(
            f"activity=context_gathering, tenant_id={tenant_id}, object={object}, object_category={object_category}, "
            f"object_id=None, decision=skip, reason=no_object_id_found_for_object"
        )
        return None

    # get the object_state from the main KVstore collection (use cache if available)
    if object_state_cache is not None and object_id in object_state_cache:
        (
            collection_object_state,
            collection_anomaly_reason,
            collection_status_message_json,
            collection_monitored_state,
        ) = object_state_cache[object_id]
    else:
        (
            collection_object_state,
            collection_anomaly_reason,
            collection_status_message_json,
            collection_monitored_state,
        ) = get_object_state_from_main_kvstore(helper, collection_main, object_id)
        if object_state_cache is not None:
            object_state_cache[object_id] = (
                collection_object_state,
                collection_anomaly_reason,
                collection_status_message_json,
                collection_monitored_state,
            )

    # get object_state, collection_anomaly_reason and status_message_json from the event, if not present fallback to the collection values
    event_object_state = event.get("object_state", collection_object_state)
    event_anomaly_reason = event.get("anomaly_reason", collection_anomaly_reason)
    # normalize the anomaly_reason
    event_anomaly_reason = normalize_anomaly_reason(event_anomaly_reason)
    event_status_message_json = event.get(
        "status_message_json", collection_status_message_json
    )
    event_monitored_state = event.get("monitored_state", collection_monitored_state)

    # merge: the event values take precedence when truthy
    object_state = (
        event_object_state if event_object_state else collection_object_state
    )
    anomaly_reason = (
        event_anomaly_reason if event_anomaly_reason else collection_anomaly_reason
    )
    # NOTE(review): status_message_json is computed but not part of the returned context
    status_message_json = (
        event_status_message_json
        if event_status_message_json
        else collection_status_message_json
    )
    # for monitored_state, prefer the collection value instead of the event value
    monitored_state = (
        collection_monitored_state
        if collection_monitored_state
        else event_monitored_state
    )

    # cannot process if object_state is None
    if not object_state:
        log_info(
            f"activity=context_gathering, tenant_id={tenant_id}, object={object}, object_id={object_id}, "
            f"object_state=None, decision=skip, reason=failed_to_retrieve_object_state_from_kvstore"
        )
        return None

    # get the stateful record, if any (use cache if available)
    if stateful_record_cache is not None and object_id in stateful_record_cache:
        stateful_record = stateful_record_cache[object_id]
    else:
        stateful_record = get_stateful_records_for_object_id(
            helper, collection_stateful_alerting, object_id
        )
        if stateful_record_cache is not None:
            stateful_record_cache[object_id] = stateful_record

    if stateful_record:
        stateful_record_state = stateful_record.get("object_state", "unknown")
        stateful_record_incident_id = stateful_record.get("incident_id", "unknown")
        stateful_record_alert_status = stateful_record.get("alert_status", "unknown")
        log_info(
            f"activity=context_gathering, tenant_id={tenant_id}, object={object}, object_id={object_id}, "
            f"object_state={object_state}, decision=stateful_record_found, "
            f"reason=active_stateful_record_exists, stateful_record_state={stateful_record_state}, "
            f"incident_id={stateful_record_incident_id}, alert_status={stateful_record_alert_status}"
        )
    else:
        log_info(
            f"activity=context_gathering, tenant_id={tenant_id}, object={object}, object_id={object_id}, "
            f"object_state={object_state}, decision=no_stateful_record, reason=no_active_stateful_record_found"
        )

    # Helper function for tenant_in_scope: accepts a list, a comma-separated
    # string, or "*" (wildcard = all tenants in scope)
    def tenant_in_scope(tenant_id, tenants_scope):
        try:
            if isinstance(tenants_scope, list):
                scope_list = tenants_scope
            elif isinstance(tenants_scope, str):
                ts = tenants_scope.strip()
                if ts == "" or ts == "*":
                    scope_list = ["*"]
                else:
                    scope_list = [s.strip() for s in ts.split(",") if s.strip()]
            else:
                scope_list = ["*"]
        except Exception:
            scope_list = ["*"]
        if "*" in scope_list:
            return True
        return tenant_id in scope_list

    # Check maintenance status (best-effort: any error means not in maintenance)
    maintenance_active = False
    try:
        if maintenance_info and maintenance_info.get("maintenance"):
            if tenant_in_scope(tenant_id, maintenance_info.get("tenants_scope", ["*"])):
                maintenance_active = True
    except Exception:
        maintenance_active = False

    # Check bank holidays status (best-effort: any error means not active)
    bank_holidays_active = False
    try:
        if bank_holidays_info:
            payload = bank_holidays_info.get("payload", bank_holidays_info)
            if payload.get("is_active", False):
                bank_holidays_active = True
    except Exception:
        bank_holidays_active = False

    # Get event time (fall back to "now" when _time is absent)
    event_time = float(event.get("_time", time.time()))

    # Get mtime from stateful_record if it exists
    mtime = None
    if stateful_record:
        mtime_raw = stateful_record.get("mtime")
        if mtime_raw is not None:
            try:
                mtime = float(mtime_raw)
            except (ValueError, TypeError) as e:
                log_warning(
                    f"stateful_record exists but mtime cannot be converted to float: tenant_id={tenant_id}, object={object}, object_id={object_id}, mtime_raw={mtime_raw}, error={str(e)}. Skipping mtime check."
                )
                mtime = None

    return {
        "tenant_id": tenant_id,
        "object": object,
        "object_category": object_category,
        "object_id": object_id,
        "object_state": object_state,
        "monitored_state": monitored_state,
        "component": component,
        "ack_active": ack_active,
        "ack_age": ack_age,
        "stateful_record": stateful_record,
        "maintenance_active": maintenance_active,
        "bank_holidays_active": bank_holidays_active,
        "event_time": event_time,
        "mtime": mtime,
        "priority": priority,
        "anomaly_reason": anomaly_reason,
    }
def get_stateful_alert_config(service, tenant_id):
    """
    Retrieve the stateful alert configuration for a tenant.

    Locates the tenant's saved search that carries the trackme_stateful_alert
    action and reads its orange_as_alerting_state parameter.

    Args:
        service: The Splunk service object
        tenant_id: The tenant identifier

    Returns:
        dict: Configuration dict with 'orange_as_alerting_state' (int, 0 or 1)
        and 'alerting_states' (list)
    """
    # default configuration: orange is not considered an alerting state
    fallback = {"orange_as_alerting_state": 0, "alerting_states": ["red"]}
    try:
        # Alert name pattern: "TrackMe alert tenant_id:{tenant_id} - *"
        prefix = f"TrackMe alert tenant_id:{tenant_id} -"

        # Iterating over saved_searches yields SavedSearch objects, not
        # strings: the .name property carries the string name
        matched = None
        for candidate in service.saved_searches:
            candidate_name = candidate.name
            if not candidate_name.startswith(prefix):
                continue
            # check that this alert has the trackme_stateful_alert action
            raw_actions = candidate.content.get("actions", "")
            if isinstance(raw_actions, str):
                configured_actions = [a.strip() for a in raw_actions.split(",") if a.strip()]
            elif isinstance(raw_actions, list):
                configured_actions = raw_actions
            else:
                configured_actions = []
            if "trackme_stateful_alert" in configured_actions:
                matched = candidate
                log.info(
                    f"activity=config_retrieval, tenant_id={tenant_id}, decision=found, "
                    f"reason=stateful_alert_found, alert_name={candidate_name}"
                )
                break

        if not matched:
            log.info(
                f"activity=config_retrieval, tenant_id={tenant_id}, decision=use_default, "
                f"reason=no_stateful_alert_found"
            )
            return dict(fallback)

        # extract the orange_as_alerting_state parameter (default 0)
        orange_as_alerting_state = 0
        try:
            raw_value = matched.content.get("action.trackme_stateful_alert.param.orange_as_alerting_state")
            if raw_value is not None:
                orange_as_alerting_state = int(raw_value)
        except (ValueError, TypeError) as e:
            log.warning(
                f"activity=config_retrieval, tenant_id={tenant_id}, decision=use_default, "
                f"reason=failed_to_parse_orange_as_alerting_state, exception={str(e)}"
            )
            orange_as_alerting_state = 0

        # derive the list of alerting states from the configuration
        alerting_states = ["red"] + (["orange"] if orange_as_alerting_state else [])

        log.info(
            f"activity=config_retrieval, tenant_id={tenant_id}, decision=retrieved, "
            f"reason=stateful_alert_config_retrieved, orange_as_alerting_state={orange_as_alerting_state}, "
            f"alerting_states={alerting_states}"
        )
        return {
            "orange_as_alerting_state": orange_as_alerting_state,
            "alerting_states": alerting_states,
        }

    except Exception as e:
        log.error(
            f"activity=config_retrieval, tenant_id={tenant_id}, decision=use_default, "
            f"reason=error_retrieving_stateful_alert_configuration, exception={str(e)}"
        )
        # On error, use default configuration
        return dict(fallback)
def should_prefilter_yield_event(event, context, alerting_states, sourcetype):
    """
    Pre-filter: decide whether an event meets the basic conditions for being
    investigated by the stateful backend.

    Yield conditions:
        1. SLA breach events (sourcetype=trackme:sla_breaches): yield as
           update when a stateful record exists, or as a new incident when
           the state is alerting.
        2. No stateful record AND the object is in an alerting state
           (honouring the orange_as_alerting_state configuration).
        3. A stateful record exists AND the state changed (closure, update,
           or reopening).
        4. A stateful record exists with unchanged state, but the event is a
           discrete type (flip, notable, or other non-continuous sourcetype).

    Args:
        event: The event dictionary
        context: The context dictionary from get_event_filtering_context()
        alerting_states: List of alerting states (e.g., ["red", "orange"])
        sourcetype: The sourcetype of the event

    Returns:
        tuple: (should_yield: bool, reason: str or None)
    """
    if context is None:
        return (False, "Failed to gather event context")

    record = context.get("stateful_record")
    state = context.get("object_state")
    in_alerting = state in alerting_states

    # SLA breaches: update scenario when a record exists, new incident when alerting
    if sourcetype == "trackme:sla_breaches":
        if record:
            return (True, "SLA breach event with existing stateful record - yield as update")
        if in_alerting:
            return (True, f"SLA breach event in alerting state - object_state={state} is in alerting_states={alerting_states}")
        return (False, f"SLA breach event but object_state={state} is not in alerting_states={alerting_states}")

    # No record yet: only alerting states may open a new incident
    if not record:
        if in_alerting:
            return (True, f"No stateful record and object_state={state} is in alerting_states={alerting_states}")
        return (False, f"No stateful record but object_state={state} is not in alerting_states={alerting_states}")

    # A record exists: a state transition is always yielded, whatever the sourcetype
    previous_state = record.get("object_state")
    if previous_state != state:
        return (True, f"State changed: previous_state={previous_state}, new_state={state}")

    # Discrete event types still produce updates even when the state is unchanged
    if sourcetype == "trackme:flip":
        return (True, "Flip event with existing stateful record - yield as update even if state unchanged")
    if sourcetype and "notable" in sourcetype.lower():
        return (True, "Notable event with existing stateful record - yield as update even if state unchanged")

    # Continuous trackme:state events are suppressed while the state is stable,
    # preventing repeated updates for entities remaining in the same alerting state
    if sourcetype == "trackme:state":
        last_update = context.get("mtime")
        observed_at = context.get("event_time")
        if last_update is not None and observed_at is not None:
            return (False, f"Has stateful record, state unchanged, trackme:state continuous event - skipping to avoid duplicate updates: object_state={state}, event_time={observed_at}, mtime={last_update}")
        return (False, f"Has stateful record and state unchanged: object_state={state}, sourcetype={sourcetype}")

    # Any other sourcetype is treated as discrete: yield as update
    return (True, f"Has stateful record, state unchanged, but discrete event type - yield as update: object_state={state}, sourcetype={sourcetype}")
def filter_stateful_alert_event(
    helper,
    server_uri,
    header,
    event,
    context,
    alerting_states,
    validation_cache=None,
    sourcetype=None,
):
    """
    Function to filter a stateful alert event based on all filtering criteria.
    Returns True if the event should be processed, False if it should be skipped.
    Args:
        helper: The helper object (can be None for custom command usage)
        server_uri: The server URI
        header: The authorization header
        event: The event dictionary (only event_id is read by this function)
        context: The context dictionary from get_event_filtering_context()
        alerting_states: List of alerting states (e.g., ["red", "orange"])
        validation_cache: Optional dict caching REST state-validation results
        sourcetype: Optional sourcetype of the event
    Returns:
        tuple: (should_process: bool, skip_reason: str or None)
    """
    if context is None:
        return (False, "Failed to gather event context")

    # Helper function for logging: use the alert-action helper when provided,
    # otherwise the module-level logger
    def log_info(msg):
        if helper:
            helper.log_info(msg)
        else:
            log.info(msg)

    tenant_id = context["tenant_id"]
    object = context["object"]
    object_id = context["object_id"]
    object_state = context["object_state"]
    monitored_state = context["monitored_state"]
    component = context["component"]
    ack_active = context["ack_active"]
    ack_age = context["ack_age"]
    stateful_record = context["stateful_record"]
    maintenance_active = context["maintenance_active"]
    bank_holidays_active = context.get("bank_holidays_active", False)
    event_time = context["event_time"]
    mtime = context["mtime"]

    # if the monitored_state is "disabled", we need to skip the processing
    if monitored_state == "disabled":
        return (
            False,
            f'monitored_state is disabled: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}',
        )

    # if the stateful record is not found, we need to check if we should create a new thread
    if not stateful_record:
        if maintenance_active and object_state in alerting_states:
            return (
                False,
                f"maintenance active, skipping new incident creation: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}",
            )
        if bank_holidays_active and object_state in alerting_states:
            return (
                False,
                f"bank holidays active, skipping new incident creation: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}",
            )

        # Check if ack is active and older than 5 minutes - if yes, skip creating new incident
        if ack_active and ack_age > 300:  # 300 seconds = 5 minutes
            return (
                False,
                f'Ack is active and older than 5 minutes: tenant_id={tenant_id}, object={object}, object_id={object_id}, ack_active=True, ack_age={ack_age:.2f} seconds',
            )

        # Safety check: Validate object_state via REST call before accepting entity for alert
        # This ensures the entity is actually in an alerting state according to the decision maker
        # NOTE(review): this caching/validation sequence is duplicated in the
        # stateful_record branch below — candidate for extraction into a helper
        if object_state in alerting_states:
            # Use cache if available (with short TTL since state can change)
            cache_key = (object_id, tuple(sorted(alerting_states)))
            cache_ttl = 5  # Cache validation results for 5 seconds
            current_time = time.time()
            if validation_cache is not None and cache_key in validation_cache:
                cached_result, cached_time = validation_cache[cache_key]
                if current_time - cached_time < cache_ttl:
                    is_valid, actual_object_state, error_message = cached_result
                else:
                    # Cache expired, refresh
                    is_valid, actual_object_state, error_message = validate_object_state_via_rest(
                        helper=helper,
                        server_uri=server_uri,
                        header=header,
                        tenant_id=tenant_id,
                        component=component,
                        object_id=object_id,
                        alerting_states=alerting_states,
                    )
                    validation_cache[cache_key] = ((is_valid, actual_object_state, error_message), current_time)
            else:
                is_valid, actual_object_state, error_message = validate_object_state_via_rest(
                    helper=helper,
                    server_uri=server_uri,
                    header=header,
                    tenant_id=tenant_id,
                    component=component,
                    object_id=object_id,
                    alerting_states=alerting_states,
                )
                if validation_cache is not None:
                    validation_cache[cache_key] = ((is_valid, actual_object_state, error_message), current_time)
            if not is_valid:
                # Skip this event - the actual object_state from the decision maker is not in alerting_states
                return (
                    False,
                    f"Object state validation failed: actual_object_state={actual_object_state}, expected_states={alerting_states}: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}",
                )

        # do not process if no stateful_record and object_state is green/blue
        if object_state not in alerting_states:
            return (
                False,
                f'no stateful record and non-alerting state: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}, event_id={event.get("event_id")}',
            )

    # check if the event should be skipped (for existing stateful records)
    if stateful_record:
        # Check if the current event's message_source_id (event_id) matches the one used for opened status
        # If they match, we should not allow an updated status
        event_id = event.get("event_id")
        stateful_record_message_source_id = stateful_record.get("message_source_id")
        if stateful_record_message_source_id and event_id and stateful_record_message_source_id == event_id:
            # The event_id (message_source_id) is the same as the one used for opened status
            # Skip processing this event to prevent duplicate updates
            return (
                False,
                f"event_id matches the one used for opened status, preventing duplicate update: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}, event_id={event_id}",
            )

        # Additional check: If the action would be updated BUT the sourcetype is trackme:state or trackme:flip
        # AND anomaly_reason has not changed, we should skip the updated event
        if sourcetype in ("trackme:state", "trackme:flip") and object_state in alerting_states:
            # Get current event's anomaly_reason from context (already normalized)
            current_anomaly_reason = context.get("anomaly_reason", [])
            if not isinstance(current_anomaly_reason, list):
                current_anomaly_reason = normalize_anomaly_reason(current_anomaly_reason)
            # Get stored anomaly_reason from stateful record (merge opened and updated)
            stored_opened_anomaly_reason = normalize_anomaly_reason(stateful_record.get("opened_anomaly_reason", []))
            stored_updated_anomaly_reason = normalize_anomaly_reason(stateful_record.get("updated_anomaly_reason", []))
            # Merge stored anomaly reasons (same logic as in helper)
            stored_anomaly_reason = list(set(stored_opened_anomaly_reason + stored_updated_anomaly_reason))
            # Compare anomaly_reason lists (order doesn't matter, so compare as sets)
            if isinstance(current_anomaly_reason, list) and isinstance(stored_anomaly_reason, list):
                if set(current_anomaly_reason) == set(stored_anomaly_reason):
                    # Anomaly reason hasn't changed, skip the update
                    return (
                        False,
                        f"anomaly_reason unchanged for trackme:state/trackme:flip event, skipping update: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}, sourcetype={sourcetype}, anomaly_reason={current_anomaly_reason}",
                    )

        # Get the object_state from the stateful record for comparison
        stateful_record_object_state = stateful_record.get("object_state")

        # Check if the state has changed - if it has, we should process even with active Ack
        state_changed = stateful_record_object_state != object_state

        # if we have a stateful record with mtime, and the event time is not newer than the mtime, we can skip the processing
        # UNLESS the state has changed (in which case we need to process to update/close the incident)
        if mtime is not None and event_time <= mtime and not state_changed:
            return (
                False,
                f'event is not newer than stateful record last update: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}, event_time={time.strftime("%c", time.localtime(event_time))}, mtime={time.strftime("%c", time.localtime(mtime))}',
            )

        # Early exit checks: If state hasn't changed and we're going to skip due to maintenance/bank holidays,
        # skip without validation to avoid unnecessary REST calls
        if not state_changed:
            if maintenance_active and object_state in alerting_states:
                return (
                    False,
                    f"maintenance active, skipping incident update: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}",
                )
            if bank_holidays_active and object_state in alerting_states:
                return (
                    False,
                    f"bank holidays active, skipping incident update: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}",
                )

        # Safety check: Validate object_state via REST call before accepting entity for alert update
        # This ensures the entity is actually in an alerting state according to the decision maker
        # Only validate if:
        # 1. Object is in alerting state AND
        # 2. We're actually going to process it (state changed OR not in maintenance/bank holidays)
        if object_state in alerting_states:
            # Use cache if available (with short TTL since state can change)
            cache_key = (object_id, tuple(sorted(alerting_states)))
            cache_ttl = 5  # Cache validation results for 5 seconds
            current_time = time.time()
            if validation_cache is not None and cache_key in validation_cache:
                cached_result, cached_time = validation_cache[cache_key]
                if current_time - cached_time < cache_ttl:
                    is_valid, actual_object_state, error_message = cached_result
                else:
                    # Cache expired, refresh
                    is_valid, actual_object_state, error_message = validate_object_state_via_rest(
                        helper=helper,
                        server_uri=server_uri,
                        header=header,
                        tenant_id=tenant_id,
                        component=component,
                        object_id=object_id,
                        alerting_states=alerting_states,
                    )
                    validation_cache[cache_key] = ((is_valid, actual_object_state, error_message), current_time)
            else:
                is_valid, actual_object_state, error_message = validate_object_state_via_rest(
                    helper=helper,
                    server_uri=server_uri,
                    header=header,
                    tenant_id=tenant_id,
                    component=component,
                    object_id=object_id,
                    alerting_states=alerting_states,
                )
                if validation_cache is not None:
                    validation_cache[cache_key] = ((is_valid, actual_object_state, error_message), current_time)
            if not is_valid:
                # Skip this event - the actual object_state from the decision maker is not in alerting_states
                return (
                    False,
                    f"Object state validation failed: actual_object_state={actual_object_state}, expected_states={alerting_states}: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}",
                )

        # IMPORTANT: If state has changed, process the event even if Ack is active
        # This ensures incidents are properly opened/updated/closed when state changes occur
        if state_changed:
            log_info(
                f"activity=filtering, tenant_id={tenant_id}, object={object}, object_id={object_id}, "
                f"object_state={object_state}, decision=process, reason=state_changed, "
                f"previous_state={stateful_record_object_state}, new_state={object_state}, ack_active={ack_active}"
            )
            # Continue processing - don't skip due to Ack

    # Event passed all filtering criteria
    return (True, None)
@Configuration(distributed=False)
class TrackMeStateful(GeneratingCommand):
    """
    Custom Splunk generating command for TrackMe stateful alerting.

    For a given tenant, this command searches TrackMe state/flip/sla_breaches
    and notable events, runs each event through a pre-filter and a full
    filtering pass (ack, maintenance, bank holidays, object state validation),
    deduplicates results per (object_id, sourcetype) keeping only the latest
    event, enriches the survivors with score/score_definition via REST, and
    yields the resulting records to the search pipeline.
    """

    tenant_id = Option(
        doc="""
        **Syntax:** **tenant_id=****
        **Description:** The tenant identifier.""",
        require=True,
        default=None,
    )

    def generate(self, **kwargs):
        """
        Run the stateful alert pipeline and yield the filtered, deduplicated,
        score-enriched events.

        Yields:
            dict: records produced by generate_fields() over the deduplicated
            event list (original Splunk metadata + flattened event fields,
            plus score/score_definition when available).

        Raises:
            Exception: if the main search execution fails (error events are
            deliberately NOT yielded, as records missing required fields
            would crash the downstream backend).
        """
        # Start performance counter
        start = time.time()

        # Get request info and set logging level
        reqinfo = trackme_reqinfo(
            self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri
        )
        log.setLevel(reqinfo["logging_level"])

        # Get earliest and latest times
        earliest = self._metadata.searchinfo.earliest_time
        latest = self._metadata.searchinfo.latest_time

        # Get service
        # NOTE(review): `client` (splunklib.client) is expected to be imported
        # earlier in this file — the import is not visible in this chunk, confirm.
        server_uri = self._metadata.searchinfo.splunkd_uri
        session_key = self._metadata.searchinfo.session_key
        service = client.connect(
            owner="nobody",
            app="trackme",
            port=reqinfo["server_rest_port"],
            token=session_key,
            timeout=600,
        )

        # Build header for REST calls
        header = {
            "Authorization": f"Splunk {session_key}",
            "Content-Type": "application/json",
        }

        # Retrieve stateful alert configuration to determine alerting states
        # This includes the "orange_as_alerting_state" setting
        alert_config = get_stateful_alert_config(service, self.tenant_id)
        alerting_states = alert_config["alerting_states"]
        orange_as_alerting_state = alert_config["orange_as_alerting_state"]

        log.info(
            f"activity=initialization, tenant_id={self.tenant_id}, decision=configured, "
            f"reason=alerting_states_configured, alerting_states={alerting_states}, "
            f"orange_as_alerting_state={orange_as_alerting_state}"
        )

        # Load ack collection once for efficient lookup (instead of REST calls per event)
        # Use composite keys (object::object_category) to avoid collisions when same object exists in multiple categories
        ack_collection_keys = None
        ack_collection_dict = None
        try:
            ack_collection_name = f"kv_trackme_common_alerts_ack_tenant_{self.tenant_id}"
            (
                ack_records,
                _,
                _,
                last_page,
            ) = search_kv_collection_sdkmode(
                log, service, ack_collection_name, page=1, page_count=0, orderby="keyid"
            )

            # Build composite key dictionary to avoid collisions
            ack_collection_dict = {}
            ack_collection_keys = set()
            for record in ack_records:
                obj = record.get("object")
                obj_cat = record.get("object_category")
                if obj and obj_cat:
                    composite_key = f"{obj}::{obj_cat}"
                    ack_collection_dict[composite_key] = record
                    ack_collection_keys.add(composite_key)

            log.info(
                f"activity=initialization, tenant_id={self.tenant_id}, decision=loaded, "
                f"reason=ack_collection_loaded_for_efficient_lookup, ack_records_count={len(ack_records)}, "
                f"unique_composite_keys={len(ack_collection_keys)}"
            )

        except Exception as e:
            # Best-effort: a failed KVstore preload is non-fatal by design.
            log.warning(
                f"activity=initialization, tenant_id={self.tenant_id}, decision=fallback, "
                f"reason=ack_collection_load_failed_will_use_rest_calls, exception={str(e)}"
            )
            # Will fall back to REST calls per event if collection load fails
            ack_collection_keys = None
            ack_collection_dict = None

        # Get maintenance status
        try:
            endpoint = f"{server_uri}/services/trackme/v2/maintenance/check_global_maintenance_status"
            # verify=False: call targets the local splunkd REST endpoint
            # (self-signed certificate) — consistent with the app's other calls.
            resp = requests.get(endpoint, headers=header, verify=False, timeout=60)
            resp.raise_for_status()
            maintenance_info = resp.json()

            # Normalize tenants_scope
            # Accept either a comma-separated string or a list; anything else
            # (including a missing value) degrades to the wildcard scope ["*"].
            if isinstance(maintenance_info.get("tenants_scope"), str):
                ts = maintenance_info.get("tenants_scope", "*").strip()
                if ts == "" or ts == "*":
                    maintenance_info["tenants_scope"] = ["*"]
                else:
                    maintenance_info["tenants_scope"] = [
                        s.strip() for s in ts.split(",") if s.strip()
                    ]
            elif not isinstance(maintenance_info.get("tenants_scope"), list):
                maintenance_info["tenants_scope"] = ["*"]

        except Exception as e:
            log.error(
                f"activity=maintenance_check, tenant_id={self.tenant_id}, decision=error, "
                f"reason=maintenance_check_failed, exception={str(e)}"
            )
            maintenance_info = None

        # Get bank holidays status
        bank_holidays_info = None
        try:
            endpoint = f"{server_uri}/services/trackme/v2/bank_holidays/check_active"
            resp = requests.get(endpoint, headers=header, verify=False, timeout=60)
            resp.raise_for_status()
            bank_holidays_info = resp.json()

            log.info(
                f"activity=bank_holidays_check, tenant_id={self.tenant_id}, decision=retrieved, "
                f"reason=bank_holidays_status_retrieved, is_active={bank_holidays_info.get('payload', {}).get('is_active', False)}"
            )

        except Exception as e:
            log.error(
                f"activity=bank_holidays_check, tenant_id={self.tenant_id}, decision=error, "
                f"reason=bank_holidays_check_failed, exception={str(e)}"
            )
            bank_holidays_info = None

        # Build the simplified stateful alert search query
        search_query = remove_leading_spaces(f"""\
            search (`trackme_idx({self.tenant_id})` (sourcetype=trackme:state) tenant_id="{self.tenant_id}" object_category="*" monitored_state="enabled")
            OR (`trackme_idx({self.tenant_id})` (sourcetype=trackme:flip) tenant_id="{self.tenant_id}" object_category="*" object_previous_state!="discovered")
            OR (`trackme_idx({self.tenant_id})` (sourcetype=trackme:sla_breaches) tenant_id="{self.tenant_id}" object_category="*")
            OR (`trackme_notable_idx({self.tenant_id})` tenant_id="{self.tenant_id}" object_category=*)
        """)

        # Search parameters
        search_kwargs = {
            "earliest_time": earliest,
            "latest_time": latest,
            "count": 0,
            "output_mode": "json",
        }

        log.info(
            f"activity=initialization, tenant_id={self.tenant_id}, decision=start, reason=trackmestateful_command_started"
        )

        # Create a minimal helper-like object for filtering functions
        # (the filtering functions expect a helper exposing log_* methods;
        # this adapter forwards them to the module-level logger)
        class MinimalHelper:
            def log_info(self, msg):
                log.info(msg)

            def log_debug(self, msg):
                log.debug(msg)

            def log_error(self, msg):
                log.error(msg)

            def log_warning(self, msg):
                log.warning(msg)

        minimal_helper = MinimalHelper()

        # Initialize caches for performance optimization
        # These caches avoid redundant KVstore queries and REST calls for the same object across multiple events
        object_id_cache = {}  # (object, object_category) -> object_id
        object_state_cache = {}  # object_id -> (object_state, anomaly_reason, status_message_json, monitored_state)
        stateful_record_cache = {}  # object_id -> stateful_record
        validation_cache = {}  # (object_id, alerting_states_tuple) -> ((is_valid, actual_object_state, error_message), timestamp)
        score_cache = {}  # object_id -> (score, score_definition)

        log.info(
            f"activity=initialization, tenant_id={self.tenant_id}, decision=caches_initialized, "
            f"reason=performance_optimization_caches_created"
        )

        # Execute the search
        try:
            reader = run_splunk_search(
                service, search_query, search_kwargs, 24, 5
            )

            events_processed = 0
            events_yielded = 0
            events_filtered = 0
            events_passed_filters = 0  # Track events that passed all filters (before deduplication)

            # Deduplication: Track latest event per (object_id, sourcetype) combination
            # This prevents race conditions where multiple events for the same object
            # in a single execution cause multiple updates (opened -> updated)
            events_by_key = {}  # key: (object_id, sourcetype) -> (yield_event, _time, context)

            for item in reader:
                if isinstance(item, dict):
                    events_processed += 1

                    # Get filtering context for this event
                    try:
                        event = item.get("_raw")
                        if not isinstance(event, dict):
                            if event is None:
                                raise ValueError("Event _raw field is None")
                            event = json.loads(event)

                        # get original Splunk metadata: index, sourcetype, host, source
                        index = item.get("index")
                        sourcetype = item.get("sourcetype")
                        host = item.get("host")
                        source = item.get("source")

                        yield_event = {
                            "_time": event.get("_time", time.time()),
                            "index": index,
                            "sourcetype": sourcetype,
                            "host": host,
                            "source": source,
                            "_raw": event,
                        }
                        # Flatten event fields on top of the metadata envelope
                        for key, value in event.items():
                            yield_event[key] = value

                        context = get_event_filtering_context(
                            helper=minimal_helper,
                            service=service,
                            server_uri=server_uri,
                            header=header,
                            event=event,
                            maintenance_info=maintenance_info,
                            alerting_states=alerting_states,
                            ack_collection_keys=ack_collection_keys,
                            ack_collection_dict=ack_collection_dict,
                            bank_holidays_info=bank_holidays_info,
                            object_id_cache=object_id_cache,
                            object_state_cache=object_state_cache,
                            stateful_record_cache=stateful_record_cache,
                        )

                        # If context gathering failed, skip this event
                        if context is None:
                            events_filtered += 1
                            log.info(
                                f"activity=context_gathering, tenant_id={event.get('tenant_id')}, object={event.get('object')}, "
                                f"decision=skip, reason=context_gathering_failed"
                            )
                            continue

                        # Pre-filter: Check if event meets basic conditions for stateful backend
                        should_prefilter_yield, prefilter_reason = should_prefilter_yield_event(
                            event=event,
                            context=context,
                            alerting_states=alerting_states,
                            sourcetype=sourcetype,
                        )

                        if not should_prefilter_yield:
                            events_filtered += 1
                            log.info(
                                f"activity=pre_filter, tenant_id={context.get('tenant_id')}, object={context.get('object')}, "
                                f"object_id={context.get('object_id')}, object_state={context.get('object_state')}, "
                                f"decision=skip, reason={prefilter_reason if prefilter_reason else 'unknown_reason'}"
                            )
                            continue

                        # Event passed pre-filter, now apply full filtering logic
                        # (ack checks, maintenance checks, validation, etc.)
                        should_process, skip_reason = filter_stateful_alert_event(
                            helper=minimal_helper,
                            server_uri=server_uri,
                            header=header,
                            event=event,
                            context=context,
                            alerting_states=alerting_states,
                            validation_cache=validation_cache,
                            sourcetype=sourcetype,
                        )

                        if should_process:
                            # Event passed all filters - add to deduplication dict
                            events_passed_filters += 1

                            # Use (object_id, sourcetype) as key to deduplicate within this execution
                            object_id = context.get('object_id')
                            dedup_key = (object_id, sourcetype)

                            # Convert _time to float for consistent comparison (handles string epoch timestamps)
                            event_time = float(yield_event.get("_time", time.time()))

                            # Keep only the latest event per (object_id, sourcetype) combination
                            if dedup_key not in events_by_key:
                                events_by_key[dedup_key] = (yield_event, event_time, context, prefilter_reason)
                                log.debug(
                                    f"activity=deduplication, tenant_id={context.get('tenant_id')}, object={context.get('object')}, "
                                    f"object_id={object_id}, sourcetype={sourcetype}, decision=added, "
                                    f"reason=first_event_for_object_sourcetype_combination, event_time={event_time}"
                                )
                            else:
                                existing_event, existing_time, existing_context, existing_reason = events_by_key[dedup_key]
                                # Ensure existing_time is also float for safe comparison
                                if not isinstance(existing_time, (int, float)):
                                    existing_time = float(existing_time)
                                if event_time > existing_time:
                                    # This event is newer, replace the existing one
                                    events_by_key[dedup_key] = (yield_event, event_time, context, prefilter_reason)
                                    log.debug(
                                        f"activity=deduplication, tenant_id={context.get('tenant_id')}, object={context.get('object')}, "
                                        f"object_id={object_id}, sourcetype={sourcetype}, decision=replaced, "
                                        f"reason=newer_event_found, old_time={existing_time}, new_time={event_time}"
                                    )
                                else:
                                    log.debug(
                                        f"activity=deduplication, tenant_id={context.get('tenant_id')}, object={context.get('object')}, "
                                        f"object_id={object_id}, sourcetype={sourcetype}, decision=skipped, "
                                        f"reason=older_event_ignored, existing_time={existing_time}, event_time={event_time}"
                                    )

                            log.info(
                                f"activity=filtering, tenant_id={context.get('tenant_id')}, object={context.get('object')}, "
                                f"object_id={context.get('object_id')}, object_state={context.get('object_state')}, "
                                f"decision=yield, reason=passed_pre_filter_and_full_filter, prefilter_reason={prefilter_reason}"
                            )

                        else:
                            events_filtered += 1
                            log.info(
                                f"activity=filtering, tenant_id={context.get('tenant_id')}, object={context.get('object')}, "
                                f"object_id={context.get('object_id')}, object_state={context.get('object_state')}, "
                                f"decision=skip, reason={skip_reason if skip_reason else 'unknown_reason'}"
                            )

                    except Exception as e:
                        events_filtered += 1
                        # Safely extract event info for logging (event might not be a dict if JSON parsing failed)
                        tenant_id_info = "unknown"
                        object_info = "unknown"
                        try:
                            if isinstance(event, dict):
                                tenant_id_info = event.get('tenant_id', 'unknown')
                                object_info = event.get('object', 'unknown')
                        except Exception:
                            pass
                        log.error(
                            f"activity=event_processing, tenant_id={tenant_id_info}, object={object_info}, "
                            f"decision=error, reason=exception_during_event_processing, exception={str(e)}"
                        )
                        # On error, we skip the event (fail-safe behavior)
                        continue

            # Deduplication: Extract only the latest event per (object_id, sourcetype) combination
            # This prevents race conditions where multiple events for the same object in a single execution
            # would cause multiple updates (opened -> updated) in the backend
            events_yielded_records = []
            events_yielded = len(events_by_key)  # Count deduplicated events

            for dedup_key, (yield_event, event_time, context, prefilter_reason) in events_by_key.items():
                object_id, sourcetype = dedup_key

                # Load score and score_definition from component data via REST call
                tenant_id = context.get('tenant_id')
                component = context.get('component')

                # Check cache first to avoid redundant REST calls
                score = None
                score_definition = None

                if object_id in score_cache:
                    score, score_definition = score_cache[object_id]
                    log.debug(
                        f"activity=load_score, tenant_id={tenant_id}, object_id={object_id}, "
                        f"decision=cache_hit, score={score}"
                    )
                elif tenant_id and component and object_id:
                    try:
                        # Build the REST endpoint URL
                        url = f"{server_uri}/services/trackme/v2/component/load_component_data"

                        # Prepare query parameters
                        params = {
                            "tenant_id": tenant_id,
                            "component": component,
                            "pagination_mode": "local",
                            "filter_key": object_id,
                        }

                        log.debug(
                            f"activity=load_score, tenant_id={tenant_id}, component={component}, object_id={object_id}, "
                            f"url={url}, params={params}"
                        )

                        # Make the REST call
                        response = requests.get(
                            url,
                            headers=header,
                            verify=False,
                            params=params,
                            timeout=30,
                        )

                        # Check response status
                        if response.status_code == 200:
                            try:
                                response_json = response.json()

                                # Handle different response structures:
                                # - If pagination_mode="local": response is {"payload": [records...], "status": 200}
                                #   Splunk REST framework may unwrap to just [records...]
                                # - If pagination_mode="remote": response is {"payload": {"data": [records...], "last_page": ...}, "status": 200}

                                # Check if response is wrapped in "payload"
                                if isinstance(response_json, dict) and "payload" in response_json:
                                    payload = response_json["payload"]
                                    # Check if payload has "data" key (remote pagination) or is a list (local pagination)
                                    if isinstance(payload, dict) and "data" in payload:
                                        data = payload["data"]
                                    elif isinstance(payload, list):
                                        data = payload
                                    else:
                                        data = []
                                # Check if response is a list directly (local pagination, unwrapped)
                                elif isinstance(response_json, list):
                                    data = response_json
                                # Check if response has "data" key directly
                                elif isinstance(response_json, dict) and "data" in response_json:
                                    data = response_json["data"]
                                else:
                                    data = []

                                if data and len(data) > 0:
                                    # Get the first record (should be only one when filtering by object_id)
                                    record = data[0]

                                    # Extract score and score_definition
                                    score_raw = record.get("score")
                                    score_definition = record.get("score_definition")

                                    # Convert score to integer (score is always an integer)
                                    score = None
                                    if score_raw is not None:
                                        try:
                                            score = int(float(score_raw))  # Convert to float first to handle string "100.0", then to int
                                        except (ValueError, TypeError):
                                            log.warning(
                                                f"activity=load_score, tenant_id={tenant_id}, object_id={object_id}, "
                                                f"decision=error, reason=failed_to_convert_score_to_int, score_raw={score_raw}"
                                            )
                                            score = None

                                    # Cache the results
                                    score_cache[object_id] = (score, score_definition)

                                    log.debug(
                                        f"activity=load_score, tenant_id={tenant_id}, object_id={object_id}, "
                                        f"score={score}, decision=cached"
                                    )
                                else:
                                    # Cache None values when data is empty to avoid redundant REST calls
                                    # This handles the case where REST call succeeds but no records found for object_id
                                    score_cache[object_id] = (None, None)
                                    log.debug(
                                        f"activity=load_score, tenant_id={tenant_id}, object_id={object_id}, "
                                        f"decision=cached_empty, reason=no_data_returned_from_rest_call"
                                    )

                            except (json.JSONDecodeError, KeyError, IndexError, TypeError) as e:
                                log.warning(
                                    f"activity=load_score, tenant_id={tenant_id}, component={component}, object_id={object_id}, "
                                    f"decision=error, reason=failed_to_parse_response, exception={str(e)}"
                                )
                                # Cache None values to avoid repeated failed calls
                                score_cache[object_id] = (None, None)
                        else:
                            log.warning(
                                f"activity=load_score, tenant_id={tenant_id}, component={component}, object_id={object_id}, "
                                f"decision=error, reason=rest_call_failed, status_code={response.status_code}, "
                                f"response_text={response.text[:200]}"
                            )
                            # Cache None values to avoid repeated failed calls
                            score_cache[object_id] = (None, None)

                    except Exception as e:
                        log.warning(
                            f"activity=load_score, tenant_id={tenant_id}, component={component}, object_id={object_id}, "
                            f"decision=error, reason=exception_during_rest_call, exception={str(e)}"
                        )
                        # Cache None values to avoid repeated failed calls
                        score_cache[object_id] = (None, None)

                # Add to yield_event if present
                if score is not None:
                    yield_event["score"] = score
                if score_definition is not None:
                    yield_event["score_definition"] = score_definition

                events_yielded_records.append(yield_event)
                log.debug(
                    f"activity=deduplication, tenant_id={context.get('tenant_id')}, object={context.get('object')}, "
                    f"object_id={object_id}, sourcetype={sourcetype}, decision=final_yield, "
                    f"reason=latest_event_selected_for_object_sourcetype_combination, event_time={event_time}"
                )

            # Log deduplication summary if duplicates were found
            if len(events_by_key) < events_passed_filters:
                duplicates_removed = events_passed_filters - len(events_by_key)
                log.info(
                    f"activity=deduplication, tenant_id={self.tenant_id}, decision=completed, "
                    f"reason=duplicate_events_deduplicated, events_passed_filters={events_passed_filters}, "
                    f"events_after_dedup={len(events_by_key)}, duplicates_removed={duplicates_removed}"
                )

            # yield events (now deduplicated)
            for yield_record in generate_fields(events_yielded_records):
                yield yield_record

            log.info(
                f"activity=completion, tenant_id={self.tenant_id}, decision=terminated, "
                f"reason=trackmestateful_command_completed, run_time_seconds={round(time.time() - start, 3)}, "
                f"events_processed={events_processed}, events_yielded={events_yielded}, events_filtered={events_filtered}, "
                f"cache_stats=object_id_cache_size={len(object_id_cache)}, object_state_cache_size={len(object_state_cache)}, "
                f"stateful_record_cache_size={len(stateful_record_cache)}, validation_cache_size={len(validation_cache)}, "
                f"score_cache_size={len(score_cache)}"
            )

        except Exception as e:
            log.error(
                f"activity=execution, tenant_id={self.tenant_id}, decision=error, "
                f"reason=trackmestateful_command_failed, exception={str(e)}"
            )
            # Raise exception for main search failures - don't yield error events
            # Error events with missing required fields (object, object_category) would crash the downstream backend
            raise Exception(
                f"trackmestateful command failed for tenant_id={self.tenant_id}: {str(e)}"
            ) from e
# Entry point: hand control to the Splunk SDK search-command dispatcher,
# which parses the protocol on stdin/stdout and invokes TrackMeStateful.generate().
dispatch(TrackMeStateful, sys.argv, sys.stdin, sys.stdout, __name__)