# NOTE: removed repository web-page scrape residue (topic limits / line-count header)
#!/usr/bin/env python
# coding=utf-8

# Module metadata for the TrackMe stateful alerting custom command.
__author__ = "TrackMe Limited"
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
__credits__ = "TrackMe Limited, U.K."
__license__ = "TrackMe Limited, all rights reserved"
__version__ = "0.1.0"
__maintainer__ = "TrackMe Limited, U.K."
__email__ = "support@trackme-solutions.com"
__status__ = "PRODUCTION"

# Standard library imports
import json
import logging
import os
import sys
import time

# Third-party library imports
import requests
import urllib3
from logging.handlers import RotatingFileHandler
# Disable insecure request warnings for urllib3 (REST calls below use verify=False)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# set splunkhome — raises KeyError if SPLUNK_HOME is not set, which is
# intentional: this script only runs inside a Splunk environment
splunkhome = os.environ["SPLUNK_HOME"]

# set logging: dedicated rotating log file under Splunk's var/log directory
filehandler = RotatingFileHandler(
    "%s/var/log/splunk/trackme_stateful.log" % splunkhome,
    mode="a",
    maxBytes=10000000,
    backupCount=1,
)
formatter = logging.Formatter(
    "%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s"
)
# log timestamps in UTC
logging.Formatter.converter = time.gmtime
filehandler.setFormatter(formatter)
log = logging.getLogger()  # root logger - Good to get it only once.
for hdlr in log.handlers[:]:  # remove the existing file handlers
    if isinstance(hdlr, logging.FileHandler):
        log.removeHandler(hdlr)
log.addHandler(filehandler)  # set the new handler
# set the log level to INFO, DEBUG as the default is ERROR
log.setLevel(logging.INFO)

# append current directory so sibling libraries are importable
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# import libs
import import_declare_test

# Import Splunk libs
from splunklib.searchcommands import (
    dispatch,
    GeneratingCommand,
    Configuration,
    Option,
    validators,
)

# Import trackme libs
from trackme_libs import trackme_reqinfo, run_splunk_search
from trackme_libs_get_data import search_kv_collection_sdkmode
import splunklib.client as client

# Import trackme libs utils
from trackme_libs_utils import decode_unicode, normalize_anomaly_reason, remove_leading_spaces

# Import helper functions from stateful alert helper (these are utility functions, not filtering logic)
# The helpers live in a "trackme" subdirectory next to this script.
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "trackme"))
from modalert_trackme_stateful_alert_helper import (
    get_keyid_from_main_kvstore,
    get_object_state_from_main_kvstore,
    get_stateful_records_for_object_id,
    validate_object_state_via_rest,
)
def generate_fields(records):
    """
    Normalize records so they all expose the same set of fields.

    Splunk automatically extracts fields from yielded results; when records
    have heterogeneous keys, fields absent from a given record would not be
    extracted consistently. This generator computes the union of all keys
    across the records, then yields each record with any missing key filled
    in as an empty string.

    Args:
        records: iterable of dict records. It is iterated twice (one pass to
            collect keys, one to yield), so it must be a re-iterable
            container such as a list, not a one-shot generator.

    Yields:
        dict: each input record, mutated in place so that every key of the
        union is present (missing keys are set to "").
    """
    # First pass: union of every key seen across all records.
    all_keys = set()
    for record in records:
        all_keys.update(record.keys())

    # Second pass: backfill missing keys with an empty value, then yield.
    for record in records:
        for key in all_keys:
            # setdefault only inserts when the key is absent
            record.setdefault(key, "")
        yield record
def get_event_filtering_context(
    helper,
    service,
    server_uri,
    header,
    event,
    maintenance_info,
    alerting_states,
    ack_collection_keys=None,
    ack_collection_dict=None,
    bank_holidays_info=None,
    object_id_cache=None,
    object_state_cache=None,
    stateful_record_cache=None,
):
    """
    Gather all context needed for filtering a stateful alert event.

    This includes object_id, object_state, stateful_record, ack status,
    maintenance and bank holidays status, etc.

    Args:
        helper: The helper object (can be None for custom command usage)
        service: The Splunk service object (exposes .kvstore collections)
        server_uri: The server URI
        header: The authorization header
        event: The event dictionary
        maintenance_info: The maintenance info dictionary
        alerting_states: List of alerting states (e.g., ["red", "orange"])
        ack_collection_keys: Set of composite keys "object::object_category"
            that have acks (for efficient lookup)
        ack_collection_dict: Dictionary mapping composite key -> ack_record
            (for efficient lookup)
        bank_holidays_info: Optional bank holidays info dictionary; the active
            flag may live at top level or under a "payload" sub-dict
        object_id_cache: Optional dict memoizing (object, object_category) -> object_id
        object_state_cache: Optional dict memoizing object_id -> state tuple
        stateful_record_cache: Optional dict memoizing object_id -> stateful record

    Returns:
        dict: Context dictionary with all filtering-related information, or
        None when the object_id or object_state cannot be resolved.
    """
    # Logging helpers: route to the modular alert helper when provided,
    # otherwise to the module-level root logger.
    def log_info(msg):
        if helper:
            helper.log_info(msg)
        else:
            log.info(msg)

    def log_debug(msg):
        if helper:
            helper.log_debug(msg)
        else:
            log.debug(msg)

    def log_error(msg):
        if helper:
            helper.log_error(msg)
        else:
            log.error(msg)

    def log_warning(msg):
        if helper:
            helper.log_warning(msg)
        else:
            log.warning(msg)

    # get tenant_id
    tenant_id = event["tenant_id"]

    # get alias, object, object_category, priority
    alias = event.get("alias", None)  # NOTE(review): retrieved but not returned in the context dict

    # get object, we may have to deal with problematic non ASCII chars
    object = decode_unicode(event["object"])

    object_category = event["object_category"]
    priority = event.get("priority", "medium")

    # get component from object_category (splk-<component>)
    component = object_category.split("-")[1]

    # check if the ack is active for the object using KVstore lookup
    ack_active = False
    ack_age = 0

    # Use KVstore lookup if ack collection data is provided, otherwise fall back to REST call
    if ack_collection_keys is not None and ack_collection_dict is not None:
        # Efficient KVstore lookup using composite key (object::object_category)
        composite_key = f"{object}::{object_category}"
        if composite_key in ack_collection_keys:
            ack_record = ack_collection_dict.get(composite_key)
            # Record should match since we use composite key
            if ack_record:
                ack_active_string = ack_record.get("ack_state", "inactive")
                ack_mtime_raw = ack_record.get("ack_mtime", None)
                if ack_mtime_raw:
                    try:
                        ack_mtime = float(ack_mtime_raw)
                    except (ValueError, TypeError):
                        # unparseable mtime: treat the ack as just updated
                        ack_mtime = time.time()
                else:
                    ack_mtime = time.time()

                if ack_active_string == "active":
                    ack_age = time.time() - ack_mtime
                    ack_active = True
                else:
                    ack_active = False
                    ack_age = 0
            else:
                ack_active = False
                ack_age = 0
        else:
            ack_active = False
            ack_age = 0

        # Log the result (object_id will be available after we get it from KVstore)
        if ack_active:
            log_info(
                f"activity=ack_check, tenant_id={tenant_id}, object={object}, decision=ack_active, "
                f"reason=acknowledgment_is_active, ack_age_seconds={ack_age:.2f}"
            )
        else:
            log_info(
                f"activity=ack_check, tenant_id={tenant_id}, object={object}, decision=ack_inactive, "
                f"reason=no_active_acknowledgment"
            )

    else:
        # Fallback to REST call if ack collection data not provided
        try:
            ack_response = requests.post(
                f"{server_uri}/services/trackme/v2/ack/get_ack_for_object",
                headers=header,
                data=json.dumps(
                    {
                        "tenant_id": tenant_id,
                        "object_category": object_category,
                        "object_list": object,
                    }
                ),
                verify=False,
                timeout=600,
            )
            ack_response.raise_for_status()
            ack_response_json = ack_response.json()

            if ack_response_json:
                # endpoint returns a list; only the first record is inspected
                ack_response_json = ack_response_json[0]
                ack_active_string = ack_response_json.get("ack_state", "inactive")
                ack_mtime = float(ack_response_json.get("ack_mtime", time.time()))
                if ack_active_string == "active":
                    ack_age = time.time() - ack_mtime
                    ack_active = True
                    log_info(
                        f"activity=ack_check, tenant_id={tenant_id}, object={object}, decision=ack_active, "
                        f"reason=acknowledgment_is_active, ack_age_seconds={ack_age:.2f}"
                    )
                else:
                    ack_active = False
                    ack_age = 0
                    log_info(
                        f"activity=ack_check, tenant_id={tenant_id}, object={object}, decision=ack_inactive, "
                        f"reason=no_active_acknowledgment"
                    )

        except Exception as e:
            # best effort: on any failure, consider the object not acknowledged
            log_error(
                f"activity=ack_check, tenant_id={tenant_id}, object={object}, object_category={object_category}, "
                f"decision=error, reason=exception_during_ack_retrieval, exception={str(e)}"
            )
            ack_active = False
            ack_age = 0

    # connect to the main tenant KVstore collection
    collection_main_name = f"kv_trackme_{component}_tenant_{tenant_id}"
    collection_main = service.kvstore[collection_main_name]

    # connect to the stateful KVstore collection
    collection_stateful_alerting_name = (
        f"kv_trackme_stateful_alerting_tenant_{tenant_id}"
    )
    collection_stateful_alerting = service.kvstore[
        collection_stateful_alerting_name
    ]

    # get object_id, if not part of the upstream event, get it from the main KVstore
    # try both "keyid" and "key" fields as some events use different field names
    object_id = event.get("keyid", None)
    if not object_id:
        object_id = event.get("key", None)
    if not object_id:
        # Use cache if available
        cache_key = (object, object_category)
        if object_id_cache is not None and cache_key in object_id_cache:
            object_id = object_id_cache[cache_key]
        else:
            object_id = get_keyid_from_main_kvstore(helper, collection_main, object)
            if object_id_cache is not None:
                object_id_cache[cache_key] = object_id
    if not object_id:
        log_error(
            f"activity=context_gathering, tenant_id={tenant_id}, object={object}, object_category={object_category}, "
            f"object_id=None, decision=skip, reason=no_object_id_found_for_object"
        )
        return None

    # get the object_state from the main KVstore collection (use cache if available)
    if object_state_cache is not None and object_id in object_state_cache:
        (
            collection_object_state,
            collection_anomaly_reason,
            collection_status_message_json,
            collection_monitored_state,
        ) = object_state_cache[object_id]
    else:
        (
            collection_object_state,
            collection_anomaly_reason,
            collection_status_message_json,
            collection_monitored_state,
        ) = get_object_state_from_main_kvstore(helper, collection_main, object_id)
        if object_state_cache is not None:
            object_state_cache[object_id] = (
                collection_object_state,
                collection_anomaly_reason,
                collection_status_message_json,
                collection_monitored_state,
            )

    # get object_state, collection_anomaly_reason and status_message_json from the event, if not present fallback to the collection values
    event_object_state = event.get("object_state", collection_object_state)
    event_anomaly_reason = event.get("anomaly_reason", collection_anomaly_reason)
    # normalize the anomaly_reason
    event_anomaly_reason = normalize_anomaly_reason(event_anomaly_reason)
    event_status_message_json = event.get(
        "status_message_json", collection_status_message_json
    )
    event_monitored_state = event.get("monitored_state", collection_monitored_state)

    # merge: event values win when truthy, otherwise fall back to the collection values
    object_state = (
        event_object_state if event_object_state else collection_object_state
    )
    anomaly_reason = (
        event_anomaly_reason if event_anomaly_reason else collection_anomaly_reason
    )
    # NOTE(review): status_message_json is computed but not part of the returned context
    status_message_json = (
        event_status_message_json
        if event_status_message_json
        else collection_status_message_json
    )

    # for monitored_state, prefer the collection value instead of the event value
    monitored_state = (
        collection_monitored_state
        if collection_monitored_state
        else event_monitored_state
    )

    # cannot process if object_state is None
    if not object_state:
        log_info(
            f"activity=context_gathering, tenant_id={tenant_id}, object={object}, object_id={object_id}, "
            f"object_state=None, decision=skip, reason=failed_to_retrieve_object_state_from_kvstore"
        )
        return None

    # get the stateful record, if any (use cache if available)
    if stateful_record_cache is not None and object_id in stateful_record_cache:
        stateful_record = stateful_record_cache[object_id]
    else:
        stateful_record = get_stateful_records_for_object_id(
            helper, collection_stateful_alerting, object_id
        )
        if stateful_record_cache is not None:
            stateful_record_cache[object_id] = stateful_record
    if stateful_record:
        stateful_record_state = stateful_record.get("object_state", "unknown")
        stateful_record_incident_id = stateful_record.get("incident_id", "unknown")
        stateful_record_alert_status = stateful_record.get("alert_status", "unknown")
        log_info(
            f"activity=context_gathering, tenant_id={tenant_id}, object={object}, object_id={object_id}, "
            f"object_state={object_state}, decision=stateful_record_found, "
            f"reason=active_stateful_record_exists, stateful_record_state={stateful_record_state}, "
            f"incident_id={stateful_record_incident_id}, alert_status={stateful_record_alert_status}"
        )
    else:
        log_info(
            f"activity=context_gathering, tenant_id={tenant_id}, object={object}, object_id={object_id}, "
            f"object_state={object_state}, decision=no_stateful_record, reason=no_active_stateful_record_found"
        )

    # Helper function for tenant_in_scope
    def tenant_in_scope(tenant_id, tenants_scope):
        # Returns True when tenant_id is covered by tenants_scope, which may
        # be a list, a comma separated string, or "*" / empty meaning all tenants.
        try:
            if isinstance(tenants_scope, list):
                scope_list = tenants_scope
            elif isinstance(tenants_scope, str):
                ts = tenants_scope.strip()
                if ts == "" or ts == "*":
                    scope_list = ["*"]
                else:
                    scope_list = [s.strip() for s in ts.split(",") if s.strip()]
            else:
                scope_list = ["*"]
        except Exception:
            scope_list = ["*"]
        if "*" in scope_list:
            return True
        return tenant_id in scope_list

    # Check maintenance status: active only when the maintenance flag is set
    # and this tenant is within the maintenance tenants scope
    maintenance_active = False
    try:
        if maintenance_info and maintenance_info.get("maintenance"):
            if tenant_in_scope(tenant_id, maintenance_info.get("tenants_scope", ["*"])):
                maintenance_active = True
    except Exception:
        maintenance_active = False

    # Check bank holidays status
    bank_holidays_active = False
    try:
        if bank_holidays_info:
            payload = bank_holidays_info.get("payload", bank_holidays_info)
            if payload.get("is_active", False):
                bank_holidays_active = True
    except Exception:
        bank_holidays_active = False

    # Get event time
    event_time = float(event.get("_time", time.time()))

    # Get mtime from stateful_record if it exists
    mtime = None
    if stateful_record:
        mtime_raw = stateful_record.get("mtime")
        if mtime_raw is not None:
            try:
                mtime = float(mtime_raw)
            except (ValueError, TypeError) as e:
                log_warning(
                    f"stateful_record exists but mtime cannot be converted to float: tenant_id={tenant_id}, object={object}, object_id={object_id}, mtime_raw={mtime_raw}, error={str(e)}. Skipping mtime check."
                )
                mtime = None

    return {
        "tenant_id": tenant_id,
        "object": object,
        "object_category": object_category,
        "object_id": object_id,
        "object_state": object_state,
        "monitored_state": monitored_state,
        "component": component,
        "ack_active": ack_active,
        "ack_age": ack_age,
        "stateful_record": stateful_record,
        "maintenance_active": maintenance_active,
        "bank_holidays_active": bank_holidays_active,
        "event_time": event_time,
        "mtime": mtime,
        "priority": priority,
        "anomaly_reason": anomaly_reason,
    }
def get_stateful_alert_config(service, tenant_id):
    """
    Resolve the stateful alerting configuration for a tenant.

    Scans the saved searches for the tenant's stateful alert — a search named
    "TrackMe alert tenant_id:<tenant_id> - *" that carries the
    trackme_stateful_alert action — and reads its orange_as_alerting_state
    parameter to decide which states are considered alerting.

    Args:
        service: The Splunk service object
        tenant_id: The tenant identifier

    Returns:
        dict: {'orange_as_alerting_state': int (0 or 1),
               'alerting_states': list of alerting states}
    """
    default_config = {
        "orange_as_alerting_state": 0,
        "alerting_states": ["red"],
    }

    try:
        # Alert name pattern: "TrackMe alert tenant_id:{tenant_id} - *"
        name_prefix = f"TrackMe alert tenant_id:{tenant_id} -"

        # Iterating saved_searches yields SavedSearch objects; .name holds the
        # string name. Stop at the first matching alert that carries the
        # trackme_stateful_alert action.
        matched_alert = None
        for saved_search in service.saved_searches:
            search_name = saved_search.name
            if not search_name.startswith(name_prefix):
                continue

            # "actions" may be a comma separated string or already a list
            raw_actions = saved_search.content.get("actions", "")
            if isinstance(raw_actions, str):
                enabled_actions = [
                    item.strip() for item in raw_actions.split(",") if item.strip()
                ]
            elif isinstance(raw_actions, list):
                enabled_actions = raw_actions
            else:
                enabled_actions = []

            if "trackme_stateful_alert" in enabled_actions:
                matched_alert = saved_search
                log.info(
                    f"activity=config_retrieval, tenant_id={tenant_id}, decision=found, "
                    f"reason=stateful_alert_found, alert_name={search_name}"
                )
                break

        if not matched_alert:
            log.info(
                f"activity=config_retrieval, tenant_id={tenant_id}, decision=use_default, "
                f"reason=no_stateful_alert_found"
            )
            # Default: orange is not considered an alerting state
            return default_config

        # Extract the orange_as_alerting_state action parameter (default 0)
        orange_as_alerting_state = 0
        try:
            raw_value = matched_alert.content.get(
                "action.trackme_stateful_alert.param.orange_as_alerting_state"
            )
            if raw_value is not None:
                orange_as_alerting_state = int(raw_value)
        except (ValueError, TypeError) as e:
            log.warning(
                f"activity=config_retrieval, tenant_id={tenant_id}, decision=use_default, "
                f"reason=failed_to_parse_orange_as_alerting_state, exception={str(e)}"
            )
            orange_as_alerting_state = 0

        # orange joins the alerting states only when explicitly enabled
        alerting_states = ["red", "orange"] if orange_as_alerting_state else ["red"]

        log.info(
            f"activity=config_retrieval, tenant_id={tenant_id}, decision=retrieved, "
            f"reason=stateful_alert_config_retrieved, orange_as_alerting_state={orange_as_alerting_state}, "
            f"alerting_states={alerting_states}"
        )

        return {
            "orange_as_alerting_state": orange_as_alerting_state,
            "alerting_states": alerting_states,
        }

    except Exception as e:
        log.error(
            f"activity=config_retrieval, tenant_id={tenant_id}, decision=use_default, "
            f"reason=error_retrieving_stateful_alert_configuration, exception={str(e)}"
        )
        # On error, use default configuration
        return default_config
def should_prefilter_yield_event(event, context, alerting_states, sourcetype):
    """
    Decide whether an event qualifies for investigation by the stateful backend.

    Decision rules, in order of precedence:
      1. SLA breach events (sourcetype=trackme:sla_breaches): yield when a
         stateful record exists (update) or the object is in an alerting
         state (new incident).
      2. No stateful record: yield only when the object is in an alerting
         state (taking into account the orange_as_alerting_state config).
      3. Stateful record present but its recorded state differs from the
         event's: always yield (closure, update, or reopening).
      4. State unchanged: discrete event types (flips, notables, other
         non-continuous sourcetypes) still yield as updates; continuous
         trackme:state events are skipped to avoid repeated updates.

    Args:
        event: The event dictionary
        context: The context dictionary from get_event_filtering_context()
        alerting_states: List of alerting states (e.g., ["red", "orange"])
        sourcetype: The sourcetype of the event

    Returns:
        tuple: (should_yield: bool, reason: str or None)
    """
    if context is None:
        return (False, "Failed to gather event context")

    stateful_record = context.get("stateful_record")
    object_state = context.get("object_state")
    state_is_alerting = object_state in alerting_states

    # Rule 1: SLA breaches are handled first, regardless of record state
    if sourcetype == "trackme:sla_breaches":
        if stateful_record:
            return (True, "SLA breach event with existing stateful record - yield as update")
        if state_is_alerting:
            return (True, f"SLA breach event in alerting state - object_state={object_state} is in alerting_states={alerting_states}")
        return (False, f"SLA breach event but object_state={object_state} is not in alerting_states={alerting_states}")

    # Rule 2: no record yet - only an alerting state opens a new thread
    if not stateful_record:
        if state_is_alerting:
            return (True, f"No stateful record and object_state={object_state} is in alerting_states={alerting_states}")
        return (False, f"No stateful record but object_state={object_state} is not in alerting_states={alerting_states}")

    # Rule 3: any divergence between recorded and current state is yielded
    previous_state = stateful_record.get("object_state")
    if previous_state != object_state:
        return (True, f"State changed: previous_state={previous_state}, new_state={object_state}")

    # Rule 4: state unchanged - discrete event types still produce updates
    if sourcetype == "trackme:flip":
        return (True, f"Flip event with existing stateful record - yield as update even if state unchanged")

    if sourcetype and "notable" in sourcetype.lower():
        return (True, f"Notable event with existing stateful record - yield as update even if state unchanged")

    # Continuous trackme:state events are skipped when nothing changed, to
    # avoid emitting repeated updates for entities stuck in the same state
    if sourcetype == "trackme:state":
        mtime = context.get("mtime")
        event_time = context.get("event_time")
        if mtime is not None and event_time is not None:
            return (False, f"Has stateful record, state unchanged, trackme:state continuous event - skipping to avoid duplicate updates: object_state={object_state}, event_time={event_time}, mtime={mtime}")
        return (False, f"Has stateful record and state unchanged: object_state={object_state}, sourcetype={sourcetype}")

    # Any other sourcetype is treated as a discrete event and yielded
    return (True, f"Has stateful record, state unchanged, but discrete event type - yield as update: object_state={object_state}, sourcetype={sourcetype}")
def filter_stateful_alert_event(
    helper,
    server_uri,
    header,
    event,
    context,
    alerting_states,
    validation_cache=None,
    sourcetype=None,
):
    """
    Filter a stateful alert event based on all filtering criteria.

    Returns True if the event should be processed, False if it should be skipped.

    Args:
        helper: The helper object (can be None for custom command usage)
        server_uri: The server URI
        header: The authorization header
        event: The event dictionary
        context: The context dictionary from get_event_filtering_context()
        alerting_states: List of alerting states (e.g., ["red", "orange"])
        validation_cache: Optional dict caching object_state REST validation
            results, keyed on (object_id, sorted alerting_states) with a 5s TTL
        sourcetype: Optional sourcetype of the event

    Returns:
        tuple: (should_process: bool, skip_reason: str or None)
    """
    if context is None:
        return (False, "Failed to gather event context")

    # Helper function for logging
    def log_info(msg):
        if helper:
            helper.log_info(msg)
        else:
            log.info(msg)

    # Unpack the filtering context built by get_event_filtering_context()
    tenant_id = context["tenant_id"]
    object = context["object"]
    object_id = context["object_id"]
    object_state = context["object_state"]
    monitored_state = context["monitored_state"]
    component = context["component"]
    ack_active = context["ack_active"]
    ack_age = context["ack_age"]
    stateful_record = context["stateful_record"]
    maintenance_active = context["maintenance_active"]
    bank_holidays_active = context.get("bank_holidays_active", False)
    event_time = context["event_time"]
    mtime = context["mtime"]

    # if the monitored_state is "disabled", we need to skip the processing
    if monitored_state == "disabled":
        return (
            False,
            f'monitored_state is disabled: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}',
        )

    # if the stateful record is not found, we need to check if we should create a new thread
    if not stateful_record:
        if maintenance_active and object_state in alerting_states:
            return (
                False,
                f"maintenance active, skipping new incident creation: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}",
            )
        if bank_holidays_active and object_state in alerting_states:
            return (
                False,
                f"bank holidays active, skipping new incident creation: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}",
            )
        # Check if ack is active and older than 5 minutes - if yes, skip creating new incident
        if ack_active and ack_age > 300:  # 300 seconds = 5 minutes
            return (
                False,
                f'Ack is active and older than 5 minutes: tenant_id={tenant_id}, object={object}, object_id={object_id}, ack_active=True, ack_age={ack_age:.2f} seconds',
            )

        # Safety check: Validate object_state via REST call before accepting entity for alert
        # This ensures the entity is actually in an alerting state according to the decision maker
        if object_state in alerting_states:
            # Use cache if available (with short TTL since state can change)
            cache_key = (object_id, tuple(sorted(alerting_states)))
            cache_ttl = 5  # Cache validation results for 5 seconds
            current_time = time.time()

            if validation_cache is not None and cache_key in validation_cache:
                cached_result, cached_time = validation_cache[cache_key]
                if current_time - cached_time < cache_ttl:
                    is_valid, actual_object_state, error_message = cached_result
                else:
                    # Cache expired, refresh
                    is_valid, actual_object_state, error_message = validate_object_state_via_rest(
                        helper=helper,
                        server_uri=server_uri,
                        header=header,
                        tenant_id=tenant_id,
                        component=component,
                        object_id=object_id,
                        alerting_states=alerting_states,
                    )
                    validation_cache[cache_key] = ((is_valid, actual_object_state, error_message), current_time)
            else:
                is_valid, actual_object_state, error_message = validate_object_state_via_rest(
                    helper=helper,
                    server_uri=server_uri,
                    header=header,
                    tenant_id=tenant_id,
                    component=component,
                    object_id=object_id,
                    alerting_states=alerting_states,
                )
                if validation_cache is not None:
                    validation_cache[cache_key] = ((is_valid, actual_object_state, error_message), current_time)

            if not is_valid:
                # Skip this event - the actual object_state from the decision maker is not in alerting_states
                return (
                    False,
                    f"Object state validation failed: actual_object_state={actual_object_state}, expected_states={alerting_states}: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}",
                )

        # do not process if no stateful_record and object_state is green/blue
        if object_state not in alerting_states:
            return (
                False,
                f'no stateful record and non-alerting state: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}, event_id={event.get("event_id")}',
            )

    # check if the event should be skipped (for existing stateful records)
    if stateful_record:
        # Check if the current event's message_source_id (event_id) matches the one used for opened status
        # If they match, we should not allow an updated status
        event_id = event.get("event_id")
        stateful_record_message_source_id = stateful_record.get("message_source_id")
        if stateful_record_message_source_id and event_id and stateful_record_message_source_id == event_id:
            # The event_id (message_source_id) is the same as the one used for opened status
            # Skip processing this event to prevent duplicate updates
            return (
                False,
                f"event_id matches the one used for opened status, preventing duplicate update: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}, event_id={event_id}",
            )

        # Additional check: If the action would be updated BUT the sourcetype is trackme:state or trackme:flip
        # AND anomaly_reason has not changed, we should skip the updated event
        if sourcetype in ("trackme:state", "trackme:flip") and object_state in alerting_states:
            # Get current event's anomaly_reason from context (already normalized)
            current_anomaly_reason = context.get("anomaly_reason", [])
            if not isinstance(current_anomaly_reason, list):
                current_anomaly_reason = normalize_anomaly_reason(current_anomaly_reason)

            # Get stored anomaly_reason from stateful record (merge opened and updated)
            stored_opened_anomaly_reason = normalize_anomaly_reason(stateful_record.get("opened_anomaly_reason", []))
            stored_updated_anomaly_reason = normalize_anomaly_reason(stateful_record.get("updated_anomaly_reason", []))
            # Merge stored anomaly reasons (same logic as in helper)
            stored_anomaly_reason = list(set(stored_opened_anomaly_reason + stored_updated_anomaly_reason))

            # Compare anomaly_reason lists (order doesn't matter, so compare as sets)
            if isinstance(current_anomaly_reason, list) and isinstance(stored_anomaly_reason, list):
                if set(current_anomaly_reason) == set(stored_anomaly_reason):
                    # Anomaly reason hasn't changed, skip the update
                    return (
                        False,
                        f"anomaly_reason unchanged for trackme:state/trackme:flip event, skipping update: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}, sourcetype={sourcetype}, anomaly_reason={current_anomaly_reason}",
                    )

        # Get the object_state from the stateful record for comparison
        stateful_record_object_state = stateful_record.get("object_state")

        # Check if the state has changed - if it has, we should process even with active Ack
        state_changed = stateful_record_object_state != object_state

        # if we have a stateful record with mtime, and the event time is not newer than the mtime, we can skip the processing
        # UNLESS the state has changed (in which case we need to process to update/close the incident)
        if mtime is not None and event_time <= mtime and not state_changed:
            return (
                False,
                f'event is not newer than stateful record last update: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}, event_time={time.strftime("%c", time.localtime(event_time))}, mtime={time.strftime("%c", time.localtime(mtime))}',
            )

        # Early exit checks: If state hasn't changed and we're going to skip due to maintenance/bank holidays,
        # skip without validation to avoid unnecessary REST calls
        if not state_changed:
            if maintenance_active and object_state in alerting_states:
                return (
                    False,
                    f"maintenance active, skipping incident update: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}",
                )
            if bank_holidays_active and object_state in alerting_states:
                return (
                    False,
                    f"bank holidays active, skipping incident update: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}",
                )

        # Safety check: Validate object_state via REST call before accepting entity for alert update
        # This ensures the entity is actually in an alerting state according to the decision maker
        # Only validate if:
        # 1. Object is in alerting state AND
        # 2. We're actually going to process it (state changed OR not in maintenance/bank holidays)
        if object_state in alerting_states:
            # Use cache if available (with short TTL since state can change)
            cache_key = (object_id, tuple(sorted(alerting_states)))
            cache_ttl = 5  # Cache validation results for 5 seconds
            current_time = time.time()

            if validation_cache is not None and cache_key in validation_cache:
                cached_result, cached_time = validation_cache[cache_key]
                if current_time - cached_time < cache_ttl:
                    is_valid, actual_object_state, error_message = cached_result
                else:
                    # Cache expired, refresh
                    is_valid, actual_object_state, error_message = validate_object_state_via_rest(
                        helper=helper,
                        server_uri=server_uri,
                        header=header,
                        tenant_id=tenant_id,
                        component=component,
                        object_id=object_id,
                        alerting_states=alerting_states,
                    )
                    validation_cache[cache_key] = ((is_valid, actual_object_state, error_message), current_time)
            else:
                is_valid, actual_object_state, error_message = validate_object_state_via_rest(
                    helper=helper,
                    server_uri=server_uri,
                    header=header,
                    tenant_id=tenant_id,
                    component=component,
                    object_id=object_id,
                    alerting_states=alerting_states,
                )
                if validation_cache is not None:
                    validation_cache[cache_key] = ((is_valid, actual_object_state, error_message), current_time)

            if not is_valid:
                # Skip this event - the actual object_state from the decision maker is not in alerting_states
                return (
                    False,
                    f"Object state validation failed: actual_object_state={actual_object_state}, expected_states={alerting_states}: tenant_id={tenant_id}, object={object}, object_id={object_id}, object_state={object_state}",
                )

        # IMPORTANT: If state has changed, process the event even if Ack is active
        # This ensures incidents are properly opened/updated/closed when state changes occur
        if state_changed:
            log_info(
                f"activity=filtering, tenant_id={tenant_id}, object={object}, object_id={object_id}, "
                f"object_state={object_state}, decision=process, reason=state_changed, "
                f"previous_state={stateful_record_object_state}, new_state={object_state}, ack_active={ack_active}"
            )
            # Continue processing - don't skip due to Ack

    # Event passed all filtering criteria
    return (True, None)
@Configuration(distributed=False)
class TrackMeStateful(GeneratingCommand):
    """Generating custom search command for TrackMe stateful alerting.

    For a given tenant, this command searches the tenant's TrackMe indexes for
    state, flip, SLA-breach and notable events, applies the stateful alerting
    filtering pipeline (pre-filter, full filter with ack/maintenance/bank-holidays
    and object-state validation), deduplicates to the latest event per
    (object_id, sourcetype) within the execution, enriches each surviving event
    with the component score, and yields the resulting records.
    """

    # Mandatory option: the TrackMe tenant identifier the command operates on.
    tenant_id = Option(
        doc="""
        **Syntax:** **tenant_id=****
        **Description:** The tenant identifier.""",
        require=True,
        default=None,
    )

    def generate(self, **kwargs):
        """Run the stateful alerting pipeline and yield filtered, deduplicated,
        score-enriched event records.

        Yields:
            dict: one record per surviving (object_id, sourcetype) combination,
            carrying the original event fields plus Splunk metadata and, when
            available, ``score`` / ``score_definition``.

        Raises:
            Exception: if the main search execution fails (error events are not
            yielded because downstream consumers require object fields).
        """
        # Start performance counter
        start = time.time()

        # Get request info and set logging level
        reqinfo = trackme_reqinfo(
            self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri
        )
        log.setLevel(reqinfo["logging_level"])

        # Get earliest and latest times
        earliest = self._metadata.searchinfo.earliest_time
        latest = self._metadata.searchinfo.latest_time

        # Get service
        server_uri = self._metadata.searchinfo.splunkd_uri
        session_key = self._metadata.searchinfo.session_key
        service = client.connect(
            owner="nobody",
            app="trackme",
            port=reqinfo["server_rest_port"],
            token=session_key,
            timeout=600,
        )

        # Build header for REST calls
        header = {
            "Authorization": f"Splunk {session_key}",
            "Content-Type": "application/json",
        }

        # Retrieve stateful alert configuration to determine alerting states
        # This includes the "orange_as_alerting_state" setting
        alert_config = get_stateful_alert_config(service, self.tenant_id)
        alerting_states = alert_config["alerting_states"]
        # NOTE(review): orange_as_alerting_state is only logged below; presumably
        # alerting_states already reflects this setting — confirm in
        # get_stateful_alert_config.
        orange_as_alerting_state = alert_config["orange_as_alerting_state"]

        log.info(
            f"activity=initialization, tenant_id={self.tenant_id}, decision=configured, "
            f"reason=alerting_states_configured, alerting_states={alerting_states}, "
            f"orange_as_alerting_state={orange_as_alerting_state}"
        )

        # Load ack collection once for efficient lookup (instead of REST calls per event)
        # Use composite keys (object::object_category) to avoid collisions when same object exists in multiple categories
        ack_collection_keys = None
        ack_collection_dict = None
        try:
            ack_collection_name = f"kv_trackme_common_alerts_ack_tenant_{self.tenant_id}"
            # Only the records are used; counts and last_page are discarded.
            (
                ack_records,
                _,
                _,
                last_page,
            ) = search_kv_collection_sdkmode(
                log, service, ack_collection_name, page=1, page_count=0, orderby="keyid"
            )
            # Build composite key dictionary to avoid collisions
            ack_collection_dict = {}
            ack_collection_keys = set()
            for record in ack_records:
                obj = record.get("object")
                obj_cat = record.get("object_category")
                if obj and obj_cat:
                    composite_key = f"{obj}::{obj_cat}"
                    ack_collection_dict[composite_key] = record
                    ack_collection_keys.add(composite_key)
            log.info(
                f"activity=initialization, tenant_id={self.tenant_id}, decision=loaded, "
                f"reason=ack_collection_loaded_for_efficient_lookup, ack_records_count={len(ack_records)}, "
                f"unique_composite_keys={len(ack_collection_keys)}"
            )
        except Exception as e:
            log.warning(
                f"activity=initialization, tenant_id={self.tenant_id}, decision=fallback, "
                f"reason=ack_collection_load_failed_will_use_rest_calls, exception={str(e)}"
            )
            # Will fall back to REST calls per event if collection load fails
            # (None sentinels signal the fallback to the filtering functions).
            ack_collection_keys = None
            ack_collection_dict = None

        # Get maintenance status
        try:
            endpoint = f"{server_uri}/services/trackme/v2/maintenance/check_global_maintenance_status"
            resp = requests.get(endpoint, headers=header, verify=False, timeout=60)
            resp.raise_for_status()
            maintenance_info = resp.json()
            # Normalize tenants_scope so downstream code can rely on a list:
            # a string becomes either ["*"] or a list of comma-split entries.
            if isinstance(maintenance_info.get("tenants_scope"), str):
                ts = maintenance_info.get("tenants_scope", "*").strip()
                if ts == "" or ts == "*":
                    maintenance_info["tenants_scope"] = ["*"]
                else:
                    maintenance_info["tenants_scope"] = [
                        s.strip() for s in ts.split(",") if s.strip()
                    ]
            elif not isinstance(maintenance_info.get("tenants_scope"), list):
                maintenance_info["tenants_scope"] = ["*"]
        except Exception as e:
            log.error(
                f"activity=maintenance_check, tenant_id={self.tenant_id}, decision=error, "
                f"reason=maintenance_check_failed, exception={str(e)}"
            )
            # None indicates maintenance status could not be determined.
            maintenance_info = None

        # Get bank holidays status
        bank_holidays_info = None
        try:
            endpoint = f"{server_uri}/services/trackme/v2/bank_holidays/check_active"
            resp = requests.get(endpoint, headers=header, verify=False, timeout=60)
            resp.raise_for_status()
            bank_holidays_info = resp.json()
            log.info(
                f"activity=bank_holidays_check, tenant_id={self.tenant_id}, decision=retrieved, "
                f"reason=bank_holidays_status_retrieved, is_active={bank_holidays_info.get('payload', {}).get('is_active', False)}"
            )
        except Exception as e:
            log.error(
                f"activity=bank_holidays_check, tenant_id={self.tenant_id}, decision=error, "
                f"reason=bank_holidays_check_failed, exception={str(e)}"
            )
            bank_holidays_info = None

        # Build the simplified stateful alert search query
        search_query = remove_leading_spaces(f"""\
        search (`trackme_idx({self.tenant_id})` (sourcetype=trackme:state) tenant_id="{self.tenant_id}" object_category="*" monitored_state="enabled")
        OR (`trackme_idx({self.tenant_id})` (sourcetype=trackme:flip) tenant_id="{self.tenant_id}" object_category="*" object_previous_state!="discovered")
        OR (`trackme_idx({self.tenant_id})` (sourcetype=trackme:sla_breaches) tenant_id="{self.tenant_id}" object_category="*")
        OR (`trackme_notable_idx({self.tenant_id})` tenant_id="{self.tenant_id}" object_category=*)
        """)

        # Search parameters
        search_kwargs = {
            "earliest_time": earliest,
            "latest_time": latest,
            "count": 0,
            "output_mode": "json",
        }

        log.info(
            f"activity=initialization, tenant_id={self.tenant_id}, decision=start, reason=trackmestateful_command_started"
        )

        # Create a minimal helper-like object for filtering functions
        class MinimalHelper:
            """Adapter exposing the log_info/log_debug/log_error/log_warning
            interface expected by the shared filtering functions, backed by
            the module-level logger."""

            def log_info(self, msg):
                log.info(msg)

            def log_debug(self, msg):
                log.debug(msg)

            def log_error(self, msg):
                log.error(msg)

            def log_warning(self, msg):
                log.warning(msg)

        minimal_helper = MinimalHelper()

        # Initialize caches for performance optimization
        # These caches avoid redundant KVstore queries and REST calls for the same object across multiple events
        object_id_cache = {}  # (object, object_category) -> object_id
        object_state_cache = {}  # object_id -> (object_state, anomaly_reason, status_message_json, monitored_state)
        stateful_record_cache = {}  # object_id -> stateful_record
        validation_cache = {}  # (object_id, alerting_states_tuple) -> ((is_valid, actual_object_state, error_message), timestamp)
        score_cache = {}  # object_id -> (score, score_definition)

        log.info(
            f"activity=initialization, tenant_id={self.tenant_id}, decision=caches_initialized, "
            f"reason=performance_optimization_caches_created"
        )

        # Execute the search
        try:
            reader = run_splunk_search(
                service, search_query, search_kwargs, 24, 5
            )

            events_processed = 0
            events_yielded = 0
            events_filtered = 0
            events_passed_filters = 0  # Track events that passed all filters (before deduplication)

            # Deduplication: Track latest event per (object_id, sourcetype) combination
            # This prevents race conditions where multiple events for the same object
            # in a single execution cause multiple updates (opened -> updated)
            events_by_key = {}  # key: (object_id, sourcetype) -> (yield_event, _time, context)

            for item in reader:
                if isinstance(item, dict):
                    events_processed += 1

                    # Get filtering context for this event
                    try:
                        # _raw may arrive either as a dict or as a JSON string.
                        event = item.get("_raw")
                        if not isinstance(event, dict):
                            if event is None:
                                raise ValueError("Event _raw field is None")
                            event = json.loads(event)

                        # get original Splunk metadata: index, sourcetype, host, source
                        index = item.get("index")
                        sourcetype = item.get("sourcetype")
                        host = item.get("host")
                        source = item.get("source")

                        # Candidate output record: Splunk metadata plus every
                        # field of the parsed event promoted to top level.
                        yield_event = {
                            "_time": event.get("_time", time.time()),
                            "index": index,
                            "sourcetype": sourcetype,
                            "host": host,
                            "source": source,
                            "_raw": event,
                        }
                        for key, value in event.items():
                            yield_event[key] = value
                        context = get_event_filtering_context(
                            helper=minimal_helper,
                            service=service,
                            server_uri=server_uri,
                            header=header,
                            event=event,
                            maintenance_info=maintenance_info,
                            alerting_states=alerting_states,
                            ack_collection_keys=ack_collection_keys,
                            ack_collection_dict=ack_collection_dict,
                            bank_holidays_info=bank_holidays_info,
                            object_id_cache=object_id_cache,
                            object_state_cache=object_state_cache,
                            stateful_record_cache=stateful_record_cache,
                        )

                        # If context gathering failed, skip this event
                        if context is None:
                            events_filtered += 1
                            log.info(
                                f"activity=context_gathering, tenant_id={event.get('tenant_id')}, object={event.get('object')}, "
                                f"decision=skip, reason=context_gathering_failed"
                            )
                            continue

                        # Pre-filter: Check if event meets basic conditions for stateful backend
                        should_prefilter_yield, prefilter_reason = should_prefilter_yield_event(
                            event=event,
                            context=context,
                            alerting_states=alerting_states,
                            sourcetype=sourcetype,
                        )

                        if not should_prefilter_yield:
                            events_filtered += 1
                            log.info(
                                f"activity=pre_filter, tenant_id={context.get('tenant_id')}, object={context.get('object')}, "
                                f"object_id={context.get('object_id')}, object_state={context.get('object_state')}, "
                                f"decision=skip, reason={prefilter_reason if prefilter_reason else 'unknown_reason'}"
                            )
                            continue

                        # Event passed pre-filter, now apply full filtering logic
                        # (ack checks, maintenance checks, validation, etc.)
                        should_process, skip_reason = filter_stateful_alert_event(
                            helper=minimal_helper,
                            server_uri=server_uri,
                            header=header,
                            event=event,
                            context=context,
                            alerting_states=alerting_states,
                            validation_cache=validation_cache,
                            sourcetype=sourcetype,
                        )

                        if should_process:
                            # Event passed all filters - add to deduplication dict
                            events_passed_filters += 1

                            # Use (object_id, sourcetype) as key to deduplicate within this execution
                            object_id = context.get('object_id')
                            dedup_key = (object_id, sourcetype)
                            # Convert _time to float for consistent comparison (handles string epoch timestamps)
                            event_time = float(yield_event.get("_time", time.time()))

                            # Keep only the latest event per (object_id, sourcetype) combination
                            if dedup_key not in events_by_key:
                                events_by_key[dedup_key] = (yield_event, event_time, context, prefilter_reason)
                                log.debug(
                                    f"activity=deduplication, tenant_id={context.get('tenant_id')}, object={context.get('object')}, "
                                    f"object_id={object_id}, sourcetype={sourcetype}, decision=added, "
                                    f"reason=first_event_for_object_sourcetype_combination, event_time={event_time}"
                                )
                            else:
                                existing_event, existing_time, existing_context, existing_reason = events_by_key[dedup_key]
                                # Ensure existing_time is also float for safe comparison
                                if not isinstance(existing_time, (int, float)):
                                    existing_time = float(existing_time)
                                if event_time > existing_time:
                                    # This event is newer, replace the existing one
                                    events_by_key[dedup_key] = (yield_event, event_time, context, prefilter_reason)
                                    log.debug(
                                        f"activity=deduplication, tenant_id={context.get('tenant_id')}, object={context.get('object')}, "
                                        f"object_id={object_id}, sourcetype={sourcetype}, decision=replaced, "
                                        f"reason=newer_event_found, old_time={existing_time}, new_time={event_time}"
                                    )
                                else:
                                    log.debug(
                                        f"activity=deduplication, tenant_id={context.get('tenant_id')}, object={context.get('object')}, "
                                        f"object_id={object_id}, sourcetype={sourcetype}, decision=skipped, "
                                        f"reason=older_event_ignored, existing_time={existing_time}, event_time={event_time}"
                                    )

                            log.info(
                                f"activity=filtering, tenant_id={context.get('tenant_id')}, object={context.get('object')}, "
                                f"object_id={context.get('object_id')}, object_state={context.get('object_state')}, "
                                f"decision=yield, reason=passed_pre_filter_and_full_filter, prefilter_reason={prefilter_reason}"
                            )
                        else:
                            events_filtered += 1
                            log.info(
                                f"activity=filtering, tenant_id={context.get('tenant_id')}, object={context.get('object')}, "
                                f"object_id={context.get('object_id')}, object_state={context.get('object_state')}, "
                                f"decision=skip, reason={skip_reason if skip_reason else 'unknown_reason'}"
                            )

                    except Exception as e:
                        events_filtered += 1
                        # Safely extract event info for logging (event might not be a dict if JSON parsing failed)
                        tenant_id_info = "unknown"
                        object_info = "unknown"
                        try:
                            if isinstance(event, dict):
                                tenant_id_info = event.get('tenant_id', 'unknown')
                                object_info = event.get('object', 'unknown')
                        except Exception:
                            pass
                        log.error(
                            f"activity=event_processing, tenant_id={tenant_id_info}, object={object_info}, "
                            f"decision=error, reason=exception_during_event_processing, exception={str(e)}"
                        )
                        # On error, we skip the event (fail-safe behavior)
                        continue

            # Deduplication: Extract only the latest event per (object_id, sourcetype) combination
            # This prevents race conditions where multiple events for the same object in a single execution
            # would cause multiple updates (opened -> updated) in the backend
            events_yielded_records = []
            events_yielded = len(events_by_key)  # Count deduplicated events

            for dedup_key, (yield_event, event_time, context, prefilter_reason) in events_by_key.items():
                object_id, sourcetype = dedup_key

                # Load score and score_definition from component data via REST call
                tenant_id = context.get('tenant_id')
                component = context.get('component')

                # Check cache first to avoid redundant REST calls
                score = None
                score_definition = None
                if object_id in score_cache:
                    score, score_definition = score_cache[object_id]
                    log.debug(
                        f"activity=load_score, tenant_id={tenant_id}, object_id={object_id}, "
                        f"decision=cache_hit, score={score}"
                    )
                elif tenant_id and component and object_id:
                    try:
                        # Build the REST endpoint URL
                        url = f"{server_uri}/services/trackme/v2/component/load_component_data"

                        # Prepare query parameters
                        params = {
                            "tenant_id": tenant_id,
                            "component": component,
                            "pagination_mode": "local",
                            "filter_key": object_id,
                        }

                        log.debug(
                            f"activity=load_score, tenant_id={tenant_id}, component={component}, object_id={object_id}, "
                            f"url={url}, params={params}"
                        )

                        # Make the REST call
                        response = requests.get(
                            url,
                            headers=header,
                            verify=False,
                            params=params,
                            timeout=30,
                        )

                        # Check response status
                        if response.status_code == 200:
                            try:
                                response_json = response.json()

                                # Handle different response structures:
                                # - If pagination_mode="local": response is {"payload": [records...], "status": 200}
                                #   Splunk REST framework may unwrap to just [records...]
                                # - If pagination_mode="remote": response is {"payload": {"data": [records...], "last_page": ...}, "status": 200}

                                # Check if response is wrapped in "payload"
                                if isinstance(response_json, dict) and "payload" in response_json:
                                    payload = response_json["payload"]
                                    # Check if payload has "data" key (remote pagination) or is a list (local pagination)
                                    if isinstance(payload, dict) and "data" in payload:
                                        data = payload["data"]
                                    elif isinstance(payload, list):
                                        data = payload
                                    else:
                                        data = []
                                # Check if response is a list directly (local pagination, unwrapped)
                                elif isinstance(response_json, list):
                                    data = response_json
                                # Check if response has "data" key directly
                                elif isinstance(response_json, dict) and "data" in response_json:
                                    data = response_json["data"]
                                else:
                                    data = []

                                if data and len(data) > 0:
                                    # Get the first record (should be only one when filtering by object_id)
                                    record = data[0]

                                    # Extract score and score_definition
                                    score_raw = record.get("score")
                                    score_definition = record.get("score_definition")

                                    # Convert score to integer (score is always an integer)
                                    score = None
                                    if score_raw is not None:
                                        try:
                                            score = int(float(score_raw))  # Convert to float first to handle string "100.0", then to int
                                        except (ValueError, TypeError):
                                            log.warning(
                                                f"activity=load_score, tenant_id={tenant_id}, object_id={object_id}, "
                                                f"decision=error, reason=failed_to_convert_score_to_int, score_raw={score_raw}"
                                            )
                                            score = None

                                    # Cache the results
                                    score_cache[object_id] = (score, score_definition)

                                    log.debug(
                                        f"activity=load_score, tenant_id={tenant_id}, object_id={object_id}, "
                                        f"score={score}, decision=cached"
                                    )
                                else:
                                    # Cache None values when data is empty to avoid redundant REST calls
                                    # This handles the case where REST call succeeds but no records found for object_id
                                    score_cache[object_id] = (None, None)
                                    log.debug(
                                        f"activity=load_score, tenant_id={tenant_id}, object_id={object_id}, "
                                        f"decision=cached_empty, reason=no_data_returned_from_rest_call"
                                    )
                            except (json.JSONDecodeError, KeyError, IndexError, TypeError) as e:
                                log.warning(
                                    f"activity=load_score, tenant_id={tenant_id}, component={component}, object_id={object_id}, "
                                    f"decision=error, reason=failed_to_parse_response, exception={str(e)}"
                                )
                                # Cache None values to avoid repeated failed calls
                                score_cache[object_id] = (None, None)
                        else:
                            log.warning(
                                f"activity=load_score, tenant_id={tenant_id}, component={component}, object_id={object_id}, "
                                f"decision=error, reason=rest_call_failed, status_code={response.status_code}, "
                                f"response_text={response.text[:200]}"
                            )
                            # Cache None values to avoid repeated failed calls
                            score_cache[object_id] = (None, None)
                    except Exception as e:
                        log.warning(
                            f"activity=load_score, tenant_id={tenant_id}, component={component}, object_id={object_id}, "
                            f"decision=error, reason=exception_during_rest_call, exception={str(e)}"
                        )
                        # Cache None values to avoid repeated failed calls
                        score_cache[object_id] = (None, None)

                # Add to yield_event if present
                if score is not None:
                    yield_event["score"] = score
                if score_definition is not None:
                    yield_event["score_definition"] = score_definition

                events_yielded_records.append(yield_event)

                log.debug(
                    f"activity=deduplication, tenant_id={context.get('tenant_id')}, object={context.get('object')}, "
                    f"object_id={object_id}, sourcetype={sourcetype}, decision=final_yield, "
                    f"reason=latest_event_selected_for_object_sourcetype_combination, event_time={event_time}"
                )

            # Log deduplication summary if duplicates were found
            if len(events_by_key) < events_passed_filters:
                duplicates_removed = events_passed_filters - len(events_by_key)
                log.info(
                    f"activity=deduplication, tenant_id={self.tenant_id}, decision=completed, "
                    f"reason=duplicate_events_deduplicated, events_passed_filters={events_passed_filters}, "
                    f"events_after_dedup={len(events_by_key)}, duplicates_removed={duplicates_removed}"
                )

            # yield events (now deduplicated)
            for yield_record in generate_fields(events_yielded_records):
                yield yield_record

            log.info(
                f"activity=completion, tenant_id={self.tenant_id}, decision=terminated, "
                f"reason=trackmestateful_command_completed, run_time_seconds={round(time.time() - start, 3)}, "
                f"events_processed={events_processed}, events_yielded={events_yielded}, events_filtered={events_filtered}, "
                f"cache_stats=object_id_cache_size={len(object_id_cache)}, object_state_cache_size={len(object_state_cache)}, "
                f"stateful_record_cache_size={len(stateful_record_cache)}, validation_cache_size={len(validation_cache)}, "
                f"score_cache_size={len(score_cache)}"
            )

        except Exception as e:
            log.error(
                f"activity=execution, tenant_id={self.tenant_id}, decision=error, "
                f"reason=trackmestateful_command_failed, exception={str(e)}"
            )
            # Raise exception for main search failures - don't yield error events
            # Error events with missing required fields (object, object_category) would crash the downstream backend
            raise Exception(
                f"trackmestateful command failed for tenant_id={self.tenant_id}: {str(e)}"
            ) from e
|
|
|
|
|
|
# Entry point: hand control to the Splunk SDK search-command dispatcher, which
# instantiates TrackMeStateful and streams results over stdin/stdout.
dispatch(TrackMeStateful, sys.argv, sys.stdin, sys.stdout, __name__)
|