You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Splunk_Deploiement/apps/trackme/lib/trackme_libs_decisionmaker.py

6460 lines
292 KiB

#!/usr/bin/env python
# coding=utf-8
__author__ = "TrackMe Limited"
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
__credits__ = "TrackMe Limited, U.K."
__license__ = "TrackMe Limited, all rights reserved"
__version__ = "0.1.0"
__maintainer__ = "TrackMe Limited, U.K."
__email__ = "support@trackme-solutions.com"
__status__ = "PRODUCTION"
# Standard library imports
import os
import sys
import time
import logging
import ast
import json
import re
import operator
# Networking and URL handling imports
from urllib.parse import urlencode
import urllib3
# Disable insecure request warnings for urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# splunk home
splunkhome = os.environ["SPLUNK_HOME"]
# append lib
sys.path.append(os.path.join(splunkhome, "etc", "apps", "trackme", "lib"))
# import trackme libs
from trackme_libs import run_splunk_search
# Import trackme libs logicalgroup
from trackme_libs_logicalgroup import logical_group_update_green_red_members
# Import trackme libs utils
from trackme_libs_utils import strict_interpret_boolean, remove_leading_spaces
# Import trackme libs disruption queue
from trackme_libs_disruption_queue import (
disruption_queue_lookup,
disruption_queue_update,
disruption_queue_get_duration,
)
# Import trackme libs splk flx
from trackme_libs_splk_flx import normalize_flx_tracker_name
# Import collections data for default values
from collections_data import vtenant_account_default
# logging:
# To avoid overriding logging destination of callers, the libs will not set on purpose any logging definition
# and rely on callers themselves
def get_anomaly_reason_from_component_type(component_type):
    """
    Translate a score_definition component type into its anomaly_reason value.

    Args:
        component_type: The type from score_definition.components
            (e.g. "future_tolerance_breach")

    Returns:
        The corresponding anomaly_reason value (e.g. "future_over_tolerance");
        unknown component types are returned unchanged.
    """
    component_to_reason = {
        "future_tolerance_breach": "future_over_tolerance",
        "data_sampling_anomaly": "data_sampling_anomaly",
        "delay_threshold_breach": "delay_threshold_breached",
        "lag_threshold_breach": "lag_threshold_breached",
        "latency_threshold_breach": "lag_threshold_breached",
        "min_hosts_dcount_breach": "min_hosts_dcount",
        "metric_alert": "metric_alert",
        "inactive": "inactive",
        "status_not_met": "status_not_met",
        "skipping_searches": "skipping_searches",
        "execution_errors": "execution_errors",
        "orphan_search": "orphan_search",
        "execution_delayed": "execution_delayed",
        "out_of_monitoring_times": "out_of_monitoring_times",
        "ml_outliers_detection": "ml_outliers_detection",
        "manual_score": "score_breached",
        "score_breached": "score_breached",
    }
    # Fall back to the component type itself when no mapping exists
    return component_to_reason.get(component_type, component_type)
def get_impact_score(vtenant_account, field_name, default_value):
    """
    Resolve an impact score from the virtual tenant account configuration,
    falling back to the packaged defaults.

    Args:
        vtenant_account: Dictionary containing virtual tenant account configuration
        field_name: Name of the impact score field to retrieve
        default_value: Default value to use if field is not found

    Returns:
        Integer impact score value
    """
    # Tenant-level value takes precedence when present and numeric
    if isinstance(vtenant_account, dict) and vtenant_account:
        raw_value = vtenant_account.get(field_name)
        if raw_value is not None:
            try:
                return int(raw_value)
            except (ValueError, TypeError):
                pass
    # Otherwise resolve from vtenant_account_default, guarding against
    # non-numeric values
    fallback = vtenant_account_default.get(field_name, default_value)
    try:
        return int(fallback)
    except (ValueError, TypeError):
        return default_value
def get_entity_impact_score(record, component, score_type, vtenant_account, default_value):
    """
    Resolve an impact score, honouring an entity-level override first.

    Entity records may carry an "impact_score_weights" field (JSON string or
    dict); when it contains a numeric value for the requested score type, that
    value wins. Otherwise the tenant-level configuration is used.

    Args:
        record: Dictionary containing entity record data (may contain impact_score_weights)
        component: Component type ('dsm' or 'dhm')
        score_type: Score type ('delay' or 'latency')
        vtenant_account: Dictionary containing virtual tenant account configuration
        default_value: Default value to use if no override is found

    Returns:
        Integer impact score value
    """
    weights = None
    if isinstance(record, dict):
        raw_weights = record.get("impact_score_weights")
        if raw_weights:
            try:
                # Accept either a JSON encoded string or an already parsed dict
                if isinstance(raw_weights, str):
                    weights = json.loads(raw_weights)
                elif isinstance(raw_weights, dict):
                    weights = raw_weights
            except (json.JSONDecodeError, AttributeError, TypeError):
                # Unparseable override: ignore and use tenant-level config
                weights = None
    if isinstance(weights, dict):
        override = weights.get(score_type)
        if override is not None:
            try:
                return int(override)
            except (ValueError, TypeError):
                pass
    # No valid entity override: resolve from the tenant configuration
    return get_impact_score(
        vtenant_account,
        f"impact_score_{component}_{score_type}_threshold_breach",
        default_value,
    )
def parse_filters(query_parameters):
    """
    Build a list of filter dicts from flattened query parameters such as
    filter[0][field], filter[0][type], filter[0][value] or filter[0][value][N].

    String values are lowercased; list-style values keep their positional
    index (missing positions are padded with None).
    """
    parsed = []
    for param_key, param_value in query_parameters.items():
        if "filter[" not in param_key:
            continue
        segments = param_key.split("[")
        filter_index = int(segments[1].split("]")[0])
        attribute = segments[2].split("]")[0]
        # Grow the list until the referenced filter slot exists
        while len(parsed) <= filter_index:
            parsed.append({})
        normalized = param_value.lower() if isinstance(param_value, str) else param_value
        if "value" in param_key and len(segments) > 3:
            # Positional list value, e.g. filter[0][value][2]
            position = int(segments[3].split("]")[0])
            values = parsed[filter_index].setdefault("value", [])
            # Pad the list so the positional index is addressable
            while len(values) <= position:
                values.append(None)
            values[position] = normalized
        else:
            # Scalar attribute: field, type, or a non-indexed value
            parsed[filter_index][attribute] = normalized
    return parsed
def record_matches_filter(record, filter):
    """
    Evaluate a single parsed filter against a record.

    Args:
        record: dict of field name -> value
        filter: dict with keys "field", "type" and "value" as produced by parse_filters

    Behaviour:
        - An empty filter value always matches.
        - A string value that looks like a JSON list is decoded into a list.
        - Pipe-delimited record values (except for the "alias" and "object"
          fields) are treated as pseudo lists, each item stripped and lowercased.
        - Supported filter types: like, =, !=, <, <=, >, >=, in, starts, ends, regex.

    Returns:
        True when the record matches the filter, False otherwise (including
        unknown filter types).
    """
    field = filter.get("field")
    filter_type = filter.get("type")
    value = filter.get("value")
    # Immediately return True if the filter value is empty
    if value == "":
        return True
    # Try to interpret the value as a JSON list if it looks like one
    if isinstance(value, str) and value.startswith("[") and value.endswith("]"):
        try:
            value = json.loads(value)
        except json.JSONDecodeError:
            pass  # If decoding fails, proceed with the original value string
    # Prepare the record value for comparison
    record_value = record.get(field, "")
    if (
        isinstance(record_value, str)
        and "|" in record_value
        and field not in ["alias", "object"]
    ):
        # Treat as a pseudo list if record_value contains pipes
        record_value = [item.strip().lower() for item in record_value.split("|")]
    elif isinstance(record_value, str):
        record_value = record_value.strip().lower()
    if isinstance(value, str):
        value = value.strip().lower()
    # Handling for different filter types when record_value is a list
    if isinstance(record_value, list):
        if filter_type == "like":
            if isinstance(value, list):
                return any(v.lower() in item for item in record_value for v in value)
            else:
                return any(value in item for item in record_value)
        elif filter_type == "=":
            if isinstance(value, list):
                return any(item == v.lower() for item in record_value for v in value)
            else:
                return value in record_value
        # we can accept != as a filter
        elif filter_type == "!=":
            if isinstance(value, list):
                return any(item != v.lower() for item in record_value for v in value)
            else:
                return value not in record_value
        elif filter_type in ("<", "<=", ">", ">="):
            # Numerical comparisons are not defined for multi-value records
            return False
        elif filter_type == "in":
            if isinstance(value, list):
                return any(item in [v.lower() for v in value] for item in record_value)
            else:
                return value in record_value
        elif filter_type == "starts":
            return any(item.startswith(value) for item in record_value)
        elif filter_type == "ends":
            return any(item.endswith(value) for item in record_value)
        elif filter_type == "regex":
            return any(re.search(value, item) is not None for item in record_value)
    else:
        # Handling for different filter types when record_value is a string
        if filter_type == "like":
            return value in record_value
        elif filter_type == "=":
            return record_value == value
        # numerical comparison (except for != which falls back to strings)
        elif filter_type in ("<", "<=", ">", ">=", "!="):
            if filter_type == "!=":
                try:
                    # Attempt numerical comparison first
                    is_not_equal = float(record_value) != float(value)
                except ValueError:
                    # Fallback to string comparison
                    is_not_equal = record_value != value
                return is_not_equal
            else:
                try:
                    record_value = float(record_value)
                    value = float(value)
                except ValueError:
                    return False  # Skip filter if conversion fails
                # Use operator functions instead of eval(): the previous
                # implementation built an eval() expression from the filter
                # type, which is unsafe on request-derived input and slower
                comparators = {
                    "<": operator.lt,
                    "<=": operator.le,
                    ">": operator.gt,
                    ">=": operator.ge,
                }
                return comparators[filter_type](record_value, value)
        elif filter_type == "in":
            if isinstance(value, list):
                return record_value in [v.lower() for v in value]
            else:
                return record_value == value
        elif filter_type == "starts":
            return record_value.startswith(value)
        elif filter_type == "ends":
            return record_value.endswith(value)
        elif filter_type == "regex":
            return re.search(value, record_value) is not None
    return False
def pre_filter_records(data_records, query_parameters):
    """
    Pre-filter records when every filter targets one of the cheap fields
    'alias', 'object' or 'monitored_state'.

    If any filter targets another field, the records are returned unchanged
    so the full filtering pass can handle them.
    """
    filters = parse_filters(query_parameters)
    allowed_fields = {"alias", "object", "monitored_state"}
    for candidate in filters:
        if candidate.get("field") not in allowed_fields:
            # At least one filter is outside the pre-filter scope: skip pre-filtering
            return data_records
    # All filters are within scope: keep records matching every filter
    return [
        record
        for record in data_records
        if all(record_matches_filter(record, candidate) for candidate in filters)
    ]
def filter_records(data_records, query_parameters):
    """
    Apply all structured filters parsed from the query parameters, keeping
    only records that satisfy every filter (AND logic).
    """
    filters = parse_filters(query_parameters)
    if filters:
        logging.debug(f'filters="{filters}"')
    return [
        record
        for record in data_records
        if all(record_matches_filter(record, current) for current in filters)
    ]
def convert_seconds_to_duration(seconds):
    """
    Convert a number of seconds into a duration string formatted as [D+]HH:MM:SS.

    The optional leading segment is the number of days, then hours, minutes
    and seconds. Negative inputs are rendered with a leading "-".

    Returns:
        The duration string, or the integer 0 (legacy sentinel) when the input
        cannot be interpreted as a number.
    """
    try:
        # Accept ints, floats and numeric strings. Going through float() also
        # accepts float strings such as "3.5" (previously rejected), and the
        # except clause now also catches TypeError (e.g. None input), which
        # previously escaped uncaught.
        original_seconds = int(float(seconds))
    except (ValueError, TypeError):
        return 0
    # Remember the sign, then work with the absolute value
    is_negative = original_seconds < 0
    seconds = abs(original_seconds)
    # Split into days, hours, minutes and seconds
    days, seconds = divmod(seconds, 24 * 3600)
    hours, seconds = divmod(seconds, 3600)
    minutes, seconds = divmod(seconds, 60)
    # Format the duration string, prefixing days only when non-zero
    if days > 0:
        duration = f"{days}+{hours:02d}:{minutes:02d}:{seconds:02d}"
    else:
        duration = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
    # Add "-" if the original seconds were negative
    return f"-{duration}" if is_negative else duration
def convert_epoch_to_datetime(epoch):
    """
    Render an epoch timestamp as a "%d %b %Y %H:%M" local-time string.

    Returns the integer 0 when the input cannot be interpreted as a number
    (legacy behaviour preserved for callers that test against 0).
    """
    try:
        return time.strftime("%d %b %Y %H:%M", time.localtime(float(epoch)))
    except Exception:
        # Any failure (non-numeric input, out-of-range epoch) yields 0
        return 0
def get_monitoring_time_status(monitoring_time_policy, monitoring_time_rules):
    """
    Determine if an entity is currently under monitoring based on monitoring_time_policy and monitoring_time_rules.
    Arguments:
    - monitoring_time_policy: predefined policy name (string/list) or dictionary format
    - monitoring_time_rules: dictionary with week day keys (0-6) and hour lists as values
    Returns:
    - (isUnderMonitoring, anomaly_reason, status_message) tuple
    - isUnderMonitoring: True if entity is currently under monitoring, False otherwise
    - anomaly_reason: "out_of_monitoring_times" if not under monitoring, None otherwise
    - status_message: Human-readable message describing the monitoring status

    Precedence: monitoring_time_rules > monitoring_time_policy > all_time.
    Any parsing failure at either level falls through to the next level, and
    any unexpected failure defaults to "always monitored" (fail-open).
    """
    try:
        # NOTE: json is already imported at module level; this local import is redundant but harmless
        import json
        # Helper function to convert day number to day name
        def get_day_name(day_no):
            day_names = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]
            return day_names[day_no] if 0 <= day_no <= 6 else f"Day {day_no}"
        # Helper function to format time in human-readable format (HH:MM from a decimal hour)
        def format_time(hour_decimal):
            hour_int = int(hour_decimal)
            minutes = int((hour_decimal - hour_int) * 60)
            if minutes == 0:
                return f"{hour_int:02d}:00"
            else:
                return f"{hour_int:02d}:{minutes:02d}"
        # get current wday (0=Sunday, 6=Saturday) — %w matches the 0-6 keys used in rules
        current_wday_no = int(time.strftime("%w"))
        current_day_name = get_day_name(current_wday_no)
        # get current hour and minute for precise checking
        current_hour = int(time.strftime("%H"))
        current_minute = int(time.strftime("%M"))
        # Decimal hour (e.g. 14.5 for 14:30) compared against rule hour buckets
        current_hour_decimal = current_hour + (current_minute / 60.0)
        current_time_str = format_time(current_hour_decimal)
        # Priority: monitoring_time_rules > monitoring_time_policy > all_time
        # Check monitoring_time_rules first (takes precedence)
        if monitoring_time_rules is not None and monitoring_time_rules != "":
            try:
                # Parse if it's a string
                if isinstance(monitoring_time_rules, str):
                    rules_dict = json.loads(monitoring_time_rules)
                else:
                    rules_dict = monitoring_time_rules
                if isinstance(rules_dict, dict) and len(rules_dict) > 0:
                    # Check if current day is in the rules
                    day_key = str(current_wday_no)
                    if day_key in rules_dict:
                        hours_list = rules_dict[day_key]
                        if isinstance(hours_list, list) and len(hours_list) > 0:
                            # Check if current hour is in the list
                            for hour_val in hours_list:
                                try:
                                    hour_float = float(hour_val)
                                    # Check if current hour matches (within the one-hour bucket)
                                    if hour_float <= current_hour_decimal < hour_float + 1:
                                        return (
                                            True,
                                            None,
                                            f"This entity is currently under monitoring (custom rules: {current_day_name} {current_time_str})"
                                        )
                                except (ValueError, TypeError):
                                    continue
                    # Current day/hour not in rules — non-empty rules are authoritative, so do not fall back to the policy
                    return (
                        False,
                        "out_of_monitoring_times",
                        f"This entity is not currently under monitoring (custom rules: {current_day_name} {current_time_str} is not within the configured monitoring schedule)"
                    )
            except Exception as e:
                logging.warning(f"Failed to parse monitoring_time_rules: {str(e)}, falling back to policy")
        # Check monitoring_time_policy
        if monitoring_time_policy is not None and monitoring_time_policy != "":
            try:
                # Parse if it's a string
                if isinstance(monitoring_time_policy, str):
                    # Try to parse as JSON first (might be dictionary)
                    try:
                        policy_dict = json.loads(monitoring_time_policy)
                    except (json.JSONDecodeError, ValueError):
                        # Not JSON, treat as predefined policy name
                        policy_dict = None
                        policy_name = monitoring_time_policy
                elif isinstance(monitoring_time_policy, list):
                    # List of policy names - use first one
                    policy_name = monitoring_time_policy[0] if len(monitoring_time_policy) > 0 else None
                    policy_dict = None
                elif isinstance(monitoring_time_policy, dict):
                    policy_dict = monitoring_time_policy
                    policy_name = None
                else:
                    policy_dict = None
                    policy_name = None
                # NOTE(review): if a policy string parses as JSON but is not a non-empty
                # dict, policy_name is left unbound; the resulting exception is caught
                # by the handler below and the function falls back to all_time — confirm
                # this is the intended behaviour.
                # If dictionary format, use it like monitoring_time_rules
                if policy_dict is not None and isinstance(policy_dict, dict) and len(policy_dict) > 0:
                    day_key = str(current_wday_no)
                    if day_key in policy_dict:
                        hours_list = policy_dict[day_key]
                        if isinstance(hours_list, list) and len(hours_list) > 0:
                            for hour_val in hours_list:
                                try:
                                    hour_float = float(hour_val)
                                    if hour_float <= current_hour_decimal < hour_float + 1:
                                        return (
                                            True,
                                            None,
                                            f"This entity is currently under monitoring (custom policy: {current_day_name} {current_time_str})"
                                        )
                                except (ValueError, TypeError):
                                    continue
                    return (
                        False,
                        "out_of_monitoring_times",
                        f"This entity is not currently under monitoring (custom policy: {current_day_name} {current_time_str} is not within the configured monitoring schedule)"
                    )
                # Map predefined policy names to day+hour rules
                if policy_name:
                    if policy_name == "all_time":
                        return (True, None, "This entity is currently under monitoring (all_time policy)")
                    elif policy_name == "business_days_all_hours":
                        # Monday-Friday (1-5), all hours
                        if current_wday_no in [1, 2, 3, 4, 5]:
                            return (True, None, f"This entity is currently under monitoring (business_days_all_hours policy: {current_day_name})")
                        else:
                            return (
                                False,
                                "out_of_monitoring_times",
                                f"This entity is not currently under monitoring (business_days_all_hours policy: {current_day_name} is not a business day)"
                            )
                    elif policy_name == "monday_saturday_all_hours":
                        # Monday-Saturday (1-6), all hours
                        if current_wday_no in [1, 2, 3, 4, 5, 6]:
                            return (True, None, f"This entity is currently under monitoring (monday_saturday_all_hours policy: {current_day_name})")
                        else:
                            return (
                                False,
                                "out_of_monitoring_times",
                                f"This entity is not currently under monitoring (monday_saturday_all_hours policy: {current_day_name} is not within Monday-Saturday)"
                            )
                    elif policy_name == "business_days_08h_20h":
                        # Monday-Friday (1-5), 8:00-20:00
                        if current_wday_no in [1, 2, 3, 4, 5]:
                            if 8 <= current_hour < 20:
                                return (True, None, f"This entity is currently under monitoring (business_days_08h_20h policy: {current_day_name} {current_time_str})")
                            else:
                                return (
                                    False,
                                    "out_of_monitoring_times",
                                    f"This entity is not currently under monitoring (business_days_08h_20h policy: {current_day_name} {current_time_str} is outside 08:00-20:00 range)"
                                )
                        else:
                            return (
                                False,
                                "out_of_monitoring_times",
                                f"This entity is not currently under monitoring (business_days_08h_20h policy: {current_day_name} is not a business day)"
                            )
                    elif policy_name == "monday_saturday_08h_20h":
                        # Monday-Saturday (1-6), 8:00-20:00
                        if current_wday_no in [1, 2, 3, 4, 5, 6]:
                            if 8 <= current_hour < 20:
                                return (True, None, f"This entity is currently under monitoring (monday_saturday_08h_20h policy: {current_day_name} {current_time_str})")
                            else:
                                return (
                                    False,
                                    "out_of_monitoring_times",
                                    f"This entity is not currently under monitoring (monday_saturday_08h_20h policy: {current_day_name} {current_time_str} is outside 08:00-20:00 range)"
                                )
                        else:
                            return (
                                False,
                                "out_of_monitoring_times",
                                f"This entity is not currently under monitoring (monday_saturday_08h_20h policy: {current_day_name} is not within Monday-Saturday)"
                            )
            except Exception as e:
                logging.warning(f"Failed to parse monitoring_time_policy: {str(e)}, falling back to all_time")
        # Final fallback: all_time (monitor always) — also reached for unknown policy names
        return (True, None, "This entity is currently under monitoring (all_time policy)")
    except Exception as e:
        logging.error(f"get_monitoring_time_status function has failed, exception={str(e)}")
        # Fallback to all_time on error
        return (True, None, f"Monitoring time status check failed: {str(e)}, defaulting to all_time monitoring")
def get_outliers_status(isOutlier, OutliersDisabled, tenant_outliers_set_state=None, score_outliers=None):
    """
    Normalise the outlier state for an entity.

    Arguments:
    - isOutlier: incoming outlier flag (1 means an outlier was detected)
    - OutliersDisabled: 1 when the outliers feature is disabled
    - tenant_outliers_set_state: deprecated, ignored (kept for backward compatibility)
    - score_outliers: score_outliers value from calculate_score (optional, hybrid scoring)

    Returns:
        0 = not an outlier, 1 = outlier that may turn the entity red,
        2 = outlier reported but not allowed to turn the entity red (score < 100)
    """
    # Disabled feature or no detected outlier: nothing to report
    if OutliersDisabled == 1 or isOutlier != 1:
        return 0
    if score_outliers is None:
        # Legacy behaviour: without a score, the outlier may turn the entity red
        return 1
    # Score-based approach: only a score >= 100 lets the outlier turn the entity red
    return 1 if score_outliers >= 100 else 2
def get_data_sampling_status(
    data_sample_status_colour, data_sample_feature, tenant_data_sampling_set_state=None
):
    """
    Normalise the data sampling anomaly state for an entity.

    Arguments:
    - data_sample_status_colour: sampling status colour ("green", "red", ...)
    - data_sample_feature: entity-level feature state ("disabled" turns the check off)
    - tenant_data_sampling_set_state: deprecated, ignored (kept for backward compatibility)

    Returns:
        0 = no anomaly, 1 = anomaly. With the score-based approach, the impact
        score decides whether a sampling anomaly affects the entity status.
    """
    # Feature disabled at the entity level: never report an anomaly.
    # Bug fix: the previous implementation set isAnomaly to 0 here but then
    # let the colour checks below overwrite it, so a disabled entity with a
    # red sampling colour was still reported as an anomaly.
    if data_sample_feature == "disabled":
        return 0
    if data_sample_status_colour == "red":
        # With the score-based approach the score controls the impact
        # (score >= 100 turns the entity red), so always surface the anomaly
        return 1
    # "green" or any other colour: no anomaly
    return 0
def get_future_status(
    future_tolerance,
    system_future_tolerance,
    data_last_lag_seen,
    data_last_ingestion_lag_seen,
    data_last_time_seen,
    data_last_ingest,
):
    """
    Detect events indexed in the future for a data source entity.

    Arguments:
    - future_tolerance: entity-level tolerance in seconds; 0 means "use the system-level value"
    - system_future_tolerance: system-level tolerance in seconds
    - data_last_lag_seen: latest event delay in seconds
    - data_last_ingestion_lag_seen: latest ingestion latency in seconds
    - data_last_time_seen / data_last_ingest: epoch timestamps used in the message

    Returns:
        (isFuture, isFutureMsg, future_tolerance): isFuture is True when either
        lag value is below the effective tolerance (a lag below the tolerance
        indicates events timestamped in the future).
    """
    isFuture = False
    isFutureMsg = ""
    # Entity tolerance of 0 means: inherit the system-level tolerance
    if future_tolerance == 0:
        future_tolerance = system_future_tolerance
    # Normalize inputs to integers; previously a bare except could hide real
    # coding errors — only conversion failures are tolerated now
    try:
        future_tolerance = int(round(float(future_tolerance), 0))
        data_last_lag_seen = int(round(float(data_last_lag_seen), 0))
        data_last_ingestion_lag_seen = int(
            round(float(data_last_ingestion_lag_seen), 0)
        )
    except (ValueError, TypeError):
        pass
    logging.debug(
        f"data_last_lag_seen={data_last_lag_seen}, system_future_tolerance={system_future_tolerance}, future_tolerance={future_tolerance}"
    )
    if float(data_last_lag_seen) < float(future_tolerance) or float(
        data_last_ingestion_lag_seen
    ) < float(future_tolerance):
        isFuture = True
        # convert data_last_lag_seen to duration
        data_last_lag_seen_duration = convert_seconds_to_duration(data_last_lag_seen)
        # convert data_last_ingestion_lag_seen to duration
        data_last_ingestion_lag_seen_duration = convert_seconds_to_duration(
            data_last_ingestion_lag_seen
        )
        # convert data_last_time_seen to a human readable datetime
        data_last_time_seen_datetime = convert_epoch_to_datetime(data_last_time_seen)
        # convert data_last_ingest to a human readable datetime
        data_last_ingest_datetime = convert_epoch_to_datetime(data_last_ingest)
        isFutureMsg = f"""detected data indexed in the future which is most likely due to timestamping misconfiguration, timezone or time synchronization issue. Event delay is {data_last_lag_seen} seconds (duration: {data_last_lag_seen_duration}), Event latency is {data_last_ingestion_lag_seen} seconds (duration: {data_last_ingestion_lag_seen_duration}), this is beyond current tolerance threshold of {future_tolerance} seconds, latest event available (_time) for this entity: {data_last_time_seen_datetime}, latest event ingested for this entity: {data_last_ingest_datetime}. Review and fix the root cause, or adapt the future tolerance at the system level or for this entity especially."""
    else:
        isFuture = False
    return isFuture, isFutureMsg, future_tolerance
def get_future_metrics_status(
    system_future_tolerance,
    metric_last_time_seen,
):
    """
    Detect metrics indexed in the future.

    Arguments:
    - system_future_tolerance: tolerance threshold in seconds
    - metric_last_time_seen: latest metric delay in seconds

    Returns:
        (isFuture, isFutureMsg): isFuture is True when metric_last_time_seen
        is below system_future_tolerance.
    """
    logging.debug(
        f"metric_last_time_seen={metric_last_time_seen}, system_future_tolerance={system_future_tolerance}"
    )
    # Within tolerance: nothing to report
    if float(metric_last_time_seen) >= float(system_future_tolerance):
        return False, ""
    # Build the human readable alert message
    duration_repr = convert_seconds_to_duration(metric_last_time_seen)
    datetime_repr = convert_epoch_to_datetime(metric_last_time_seen)
    message = f"""detected metrics indexed in the future which is most likely due to timestamping misconfiguration, timezone or time synchronization issue. Metric delay is {metric_last_time_seen} seconds (duration: {duration_repr}) which is beyond tolerance threshold of {system_future_tolerance}, latest event available (_time) for this entity: {datetime_repr}. Review and fix the root cause, or adapt the future tolerance at the system level or for this entity especially"""
    return True, message
def get_is_under_dcount_host(min_dcount_host, min_dcount_threshold, min_dcount_field):
    """
    Check whether the distinct host count is below the required minimum.

    Arguments:
    - min_dcount_host: minimum required number of hosts (only enforced when it is a float)
    - min_dcount_threshold: observed number of hosts
    - min_dcount_field: name of the metric the count is based on (for the message)

    Returns:
        (isUnderDcountHost, isUnderDcountHostMsg)
    """
    # NOTE(review): the alert only triggers when min_dcount_host is a float;
    # integer or string values (e.g. an unset/"any" configuration) never raise
    # the condition — confirm this is the intended contract.
    triggered = (
        isinstance(min_dcount_host, float) and min_dcount_threshold < min_dcount_host
    )
    if not triggered:
        return False, ""
    message = f"""Monitoring conditions are not met due to low number of hosts. Number of hosts is {int(min_dcount_threshold)} based on the metric {min_dcount_field} which is lower than the minimum required number of hosts of {int(min_dcount_host)}"""
    return True, message
def get_logical_groups_collection_records(collection):
    """
    Page through a logical-groups KVstore collection (5000 records per page)
    and build the lookup structures used by the decision maker.

    :param collection: The collection object to query.
    :return: tuple (collection_records, collection_records_dict,
             collection_members_list, collection_members_dict,
             count_to_process_list)
    """
    collection_records = []
    collection_records_dict = {}
    count_to_process_list = []
    collection_members_list = []
    collection_members_dict = {}

    def _shared_fields(item):
        # Attributes stored both per group key and per member
        return {
            "object_group_members": item.get("object_group_members", []),
            "object_group_members_green": item.get("object_group_members_green", []),
            "object_group_members_red": item.get("object_group_members_red", []),
            "object_group_min_green_percent": item.get(
                "object_group_min_green_percent", 0
            ),
        }

    page_offset = 0
    while True:
        page = collection.data.query(skip=page_offset)
        if not page:
            break
        for item in page:
            collection_records.append(item)
            collection_records_dict[item.get("_key")] = {
                "object_group_name": item.get("object_group_name"),
                "object_group_mtime": item.get("object_group_mtime"),
                **_shared_fields(item),
            }
            try:
                # index each member once, keyed by the first group that owns it
                for member in item.get("object_group_members", []):
                    if member in collection_members_list:
                        continue
                    collection_members_list.append(member)
                    collection_members_dict[member] = {
                        "object_group_key": item.get("_key"),
                        "object_group_name": item.get("object_group_name"),
                        **_shared_fields(item),
                    }
            except Exception as e:
                logging.error(
                    f"function get_logical_groups_collection_records, error while processing logical group members, exception={str(e)}"
                )
            count_to_process_list.append(item.get("_key"))
        page_offset += 5000
    return (
        collection_records,
        collection_records_dict,
        collection_members_list,
        collection_members_dict,
        count_to_process_list,
    )
def get_and_manage_logical_group_status(
    splunkd_uri,
    session_key,
    tenant_id,
    object_name,
    object_state,
    object_group_key,
    object_logical_group_dict,
):
    """
    Evaluate the logical group status for an entity and keep the group's
    green/red membership lists in sync via the TrackMe API.

    Arguments:
    - splunkd_uri / session_key / tenant_id: Splunk API context for the membership update call
    - object_name: entity name
    - object_state: current entity state ("green", otherwise treated as red/blue)
    - object_group_key: KVstore key of the logical group record
    - object_logical_group_dict: logical group record fields for this entity

    Returns:
    - isUnderLogicalGroup (boolean): True if object_group_key is not empty and the group has more than one member
    - LogicalGroupStateInAlert (boolean): True if the group green percentage is lower than the minimum green percentage
    - LogicalGroupMsg (string): human readable status of the logical group
    """
    # object_group_members_count is pre-initialized so the status evaluation at
    # the end works even when the entity has no logical group (name is empty)
    object_group_members_count = 0
    # get logical group name
    object_group_name = object_logical_group_dict.get("object_group_name", "")
    try:
        # enter if the group is not empty
        if object_group_name != "":
            object_group_min_green_percent = object_logical_group_dict.get(
                "object_group_min_green_percent", 0
            )
            object_group_members = object_logical_group_dict.get(
                "object_group_members", []
            )
            try:
                object_group_members_count = len(object_group_members)
            except:
                object_group_members_count = 0
            # if not a list and is a string, convert to list
            if isinstance(object_group_members, str):
                object_group_members = [object_group_members]
            object_group_members_green = object_logical_group_dict.get(
                "object_group_members_green", []
            )
            # if not a list and is a string, convert to list
            if isinstance(object_group_members_green, str):
                object_group_members_green = [object_group_members_green]
            object_group_members_red = object_logical_group_dict.get(
                "object_group_members_red", []
            )
            # if not a list and is a string, convert to list
            if isinstance(object_group_members_red, str):
                object_group_members_red = [object_group_members_red]
            # if object_state is green, object_name must be in object_group_members_green but not in object_group_members_red
            # if object_state is red or blue, object_name must be in object_group_members_red but not in object_group_members_green
            # any change required to object_group_members_green and object_group_members_red implies an update to the KVstore record is required, set the boolean uppdate_kvstore_record to True if required
            update_kvstore_record = False
            if object_state == "green":
                if object_name not in object_group_members_green:
                    object_group_members_green.append(object_name)
                    update_kvstore_record = True
                if object_name in object_group_members_red:
                    object_group_members_red.remove(object_name)
                    update_kvstore_record = True
            else:
                # any non-green state is tracked in the red members list
                if object_name not in object_group_members_red:
                    object_group_members_red.append(object_name)
                    update_kvstore_record = True
                if object_name in object_group_members_green:
                    object_group_members_green.remove(object_name)
                    update_kvstore_record = True
            # if update_kvstore_record is True, call the API endpoint accordingly
            if update_kvstore_record:
                # proceed
                try:
                    response = logical_group_update_green_red_members(
                        splunkd_uri,
                        session_key,
                        tenant_id,
                        object_name,
                        object_group_key,
                        object_group_members_green,
                        object_group_members_red,
                    )
                    logging.info(
                        f'tenant="{tenant_id}", object="{object_name}", logical group green/red members update API was successfull, response="{response}"'
                    )
                except Exception as e:
                    # best effort: a failed KVstore sync is logged but does not
                    # prevent the status evaluation below
                    logging.error(
                        f'tenant="{tenant_id}", object="{object_name}", logical group green/red members update API call has failed, exception="{str(e)}"'
                    )
            # ensure object_group_min_green_percent is float
            try:
                object_group_min_green_percent = float(object_group_min_green_percent)
            except:
                object_group_min_green_percent = 0
            # calculate object_group_green_percent, if logical group is empty, then object_group_green_percent is 100
            try:
                if object_group_members_count > 0:
                    object_group_green_percent = (
                        len(object_group_members_green) / object_group_members_count
                    ) * 100
                else:
                    object_group_green_percent = 100
            except:
                object_group_green_percent = 0
        # define status and return
        # (single-member or key-less groups are not considered active logical groups)
        if object_group_key != "" and object_group_members_count > 1:
            isUnderLogicalGroup = True
            if object_group_green_percent < object_group_min_green_percent:
                LogicalGroupStateInAlert = True
                LogicalGroupMsg = f"""Logical Group {object_group_name} with key="{object_group_key}" is in alert state. The current green percentage of the group is {round(object_group_green_percent, 2)}% which is lower than the minimum green percentage of {round(object_group_min_green_percent, 2)}%, object_group_members_count={object_group_members_count}, object_group_members_red={object_group_members_red}"""
            else:
                LogicalGroupStateInAlert = False
                LogicalGroupMsg = f"""Logical Group {object_group_name} with key="{object_group_key}" is in normal state. The current green percentage of the group is {round(object_group_green_percent, 2)}% which is higher or equal to the minimal green percentage of {round(object_group_min_green_percent, 2)}%, object_group_members_count={object_group_members_count}, object_group_members_red={object_group_members_red}"""
        else:
            isUnderLogicalGroup = False
            LogicalGroupStateInAlert = False
            LogicalGroupMsg = ""
        return isUnderLogicalGroup, LogicalGroupStateInAlert, LogicalGroupMsg
    except Exception as e:
        logging.error(
            f'function get_and_manage_logical_group_status has failed, exception="{str(e)}", object_name="{object_name}", object_group_key="{object_group_key}", object_logical_group_dict="{object_logical_group_dict}"'
        )
        return (
            False,
            False,
            f'function get_and_manage_logical_group_status has failed, exception="{str(e)}", object_name="{object_name}", object_group_key="{object_group_key}"',
        )
def get_dsm_latency_status(
    data_last_ingestion_lag_seen,
    data_max_lag_allowed,
    data_last_ingest,
    data_last_time_seen,
):
    """
    Evaluate the ingestion latency status of a DSM entity.

    Args:
        data_last_ingestion_lag_seen: last observed ingestion latency in seconds
        data_max_lag_allowed: maximum allowed ingestion latency in seconds
        data_last_ingest: epoch of the latest indexed event (_indextime)
        data_last_time_seen: epoch of the latest event timestamp (_time)

    Returns:
        tuple: (isUnderLatencyAlert (bool), isUnderLatencyMessage (str))
        isUnderLatencyAlert is True when the observed ingestion latency is
        strictly higher than the maximum allowed latency; the message
        describes the condition in a human readable form, including
        durations and timestamps converted for display.
    """

    def _safe_float(value):
        # coerce to float, defaulting to 0 for None/invalid input
        try:
            return float(value)
        except (TypeError, ValueError):
            return 0

    data_last_ingestion_lag_seen = _safe_float(data_last_ingestion_lag_seen)
    data_max_lag_allowed = _safe_float(data_max_lag_allowed)
    data_last_ingest = _safe_float(data_last_ingest)
    data_last_time_seen = _safe_float(data_last_time_seen)

    # human readable forms of the thresholds and timestamps
    data_last_ingestion_lag_seen_duration = convert_seconds_to_duration(
        data_last_ingestion_lag_seen
    )
    data_max_lag_allowed_duration = convert_seconds_to_duration(data_max_lag_allowed)
    data_last_ingest_datetime = convert_epoch_to_datetime(data_last_ingest)
    data_last_time_seen_datetime = convert_epoch_to_datetime(data_last_time_seen)

    # elapsed time since the latest indexed event and the latest event timestamp
    time_since_last_ingestion = time.time() - data_last_ingest
    time_since_last_event = time.time() - data_last_time_seen

    # alert when the observed latency breaches the allowed maximum
    isUnderLatencyAlert = data_last_ingestion_lag_seen > data_max_lag_allowed

    # define isUnderLatencyMessage
    if isUnderLatencyAlert:
        # if both the time since last ingestion and the time since last event are
        # below data_max_lag_allowed, the source might be receiving a mix of
        # delayed and non-delayed events
        if (
            time_since_last_ingestion < data_max_lag_allowed
            and time_since_last_event < data_max_lag_allowed
        ):
            isUnderLatencyMessage = f"""Monitoring conditions are not met due to latency issues. Ingestion latency is approximately {round(float(data_last_ingestion_lag_seen), 3)} seconds (duration: {data_last_ingestion_lag_seen_duration}), which is higher than the maximum allowed latency of {int(data_max_lag_allowed)} seconds (duration: {data_max_lag_allowed_duration}), latest event available (_time) for this entity: {data_last_time_seen_datetime}, latest event indexed (_indextime) for this entity: {data_last_ingest_datetime}, this indicates that the source might be receiving a mix of delayed and non-delayed events"""
        else:
            isUnderLatencyMessage = f"""Monitoring conditions are not met due to latency issues. Ingestion latency is approximately {round(float(data_last_ingestion_lag_seen), 3)} seconds (duration: {data_last_ingestion_lag_seen_duration}), which is higher than the maximum allowed latency of {int(data_max_lag_allowed)} seconds (duration: {data_max_lag_allowed_duration}), latest event available (_time) for this entity: {data_last_time_seen_datetime}, latest event indexed (_indextime) for this entity: {data_last_ingest_datetime}, this indicates that the source is receiving delayed events only"""
    else:
        isUnderLatencyMessage = f"""monitoring conditions for ingest latency are met. Ingestion latency is approximately {round(float(data_last_ingestion_lag_seen), 3)} seconds (duration: {data_last_ingestion_lag_seen_duration}), which is lower than the maximum allowed latency of {int(data_max_lag_allowed)} seconds (duration: {data_max_lag_allowed_duration}), latest event indexed (_indextime) for this entity: {data_last_ingest_datetime}"""

    # return
    return isUnderLatencyAlert, isUnderLatencyMessage
def get_dsm_delay_status(
    data_last_lag_seen,
    data_max_delay_allowed,
    data_last_ingest,
    data_last_time_seen,
):
    """
    Evaluate the event delay status of a DSM entity.

    Args:
        data_last_lag_seen: last observed event delay in seconds
        data_max_delay_allowed: maximum allowed event delay in seconds
        data_last_ingest: epoch of the latest ingested event (_indextime)
        data_last_time_seen: epoch of the latest event timestamp (_time)

    Returns:
        tuple: (isUnderDelayAlert (bool), isUnderDelayMessage (str))
        isUnderDelayAlert is True when the observed event delay is strictly
        higher than the maximum allowed delay; the message describes the
        condition in a human readable form, including durations and
        timestamps converted for display.
    """

    def _safe_float(value):
        # coerce to float, defaulting to 0 for None/invalid input
        try:
            return float(value)
        except (TypeError, ValueError):
            return 0

    data_last_lag_seen = _safe_float(data_last_lag_seen)
    data_max_delay_allowed = _safe_float(data_max_delay_allowed)
    data_last_ingest = _safe_float(data_last_ingest)
    data_last_time_seen = _safe_float(data_last_time_seen)

    # human readable forms of the thresholds and timestamps
    data_last_lag_seen_duration = convert_seconds_to_duration(data_last_lag_seen)
    data_max_delay_allowed_duration = convert_seconds_to_duration(
        data_max_delay_allowed
    )
    data_last_ingest_datetime = convert_epoch_to_datetime(data_last_ingest)
    data_last_time_seen_datetime = convert_epoch_to_datetime(data_last_time_seen)

    # alert when the observed delay breaches the allowed maximum
    isUnderDelayAlert = data_last_lag_seen > data_max_delay_allowed

    # define isUnderDelayMessage
    # note: "incidates" typo fixed to "indicates" in the alert message
    if isUnderDelayAlert:
        isUnderDelayMessage = f"""Monitoring conditions are not met due to delay issues. Event delay is {round(float(data_last_lag_seen), 3)} seconds (duration: {data_last_lag_seen_duration}), which is higher than the maximum allowed delay of {int(round(float(data_max_delay_allowed), 0))} seconds (duration: {data_max_delay_allowed_duration}), latest event available (_time) for this entity: {data_last_time_seen_datetime}, latest event ingested (_indextime) for this entity: {data_last_ingest_datetime}. This indicates that the source is receiving events with timestamps older than the threshold defined for this entity."""
    else:
        isUnderDelayMessage = f"""monitoring conditions for event delay are met. Event delay is {round(float(data_last_lag_seen), 3)} seconds (duration: {data_last_lag_seen_duration}), which is lower than the maximum allowed delay of {int(round(float(data_max_delay_allowed), 0))} seconds (duration: {data_max_delay_allowed_duration}), latest event available (_time) for this entity: {data_last_time_seen_datetime}"""

    # return
    return isUnderDelayAlert, isUnderDelayMessage
def set_dsm_status(
    logger,
    splunkd_uri,
    session_key,
    tenant_id,
    record,
    isOutlier,
    isAnomaly,
    isFuture,
    isFutureMsg,
    isUnderMonitoring,
    isUnderMonitoringMsg,
    isUnderDcountHost,
    isUnderDcountHostMsg,
    object_logical_group_dict,
    isUnderLatencyAlert,
    isUnderLatencyMessage,
    isUnderDelayAlert,
    isUnderDelayMessage,
    disruption_queue_collection,
    disruption_queue_record,
    source_handler=None,
    monitoring_anomaly_reason=None,
    score=None,
    score_outliers=None,
    vtenant_account=None,
):
    """
    Compute the final state of a DSM entity and its associated status
    messages, combining the individual alert flags, the logical group state,
    hybrid scoring (when a score is provided) and the disruption queue
    (minimal disruption time) handling.

    Args:
        logger: caller-provided logger used for debug/error messages
        splunkd_uri: splunkd URI, forwarded to the logical group handler
        session_key: Splunk session key, forwarded to the logical group handler
        tenant_id: tenant identifier, forwarded to the logical group handler
        record: the entity record (dict-like); NOTE: mutated in place, this
            function writes record["score"], record["score_definition"] and
            record["disruption_min_time_sec"]
        isOutlier: outlier flag; 1 means outlier detected, 2 is treated as
            "no outlier" in the green condition
        isAnomaly: data sampling anomaly flag; 1 means anomaly, 0 and 2 are
            treated as "no anomaly" in the green condition
        isFuture: True when the future tolerance is breached
        isFutureMsg: message associated with the future tolerance breach
        isUnderMonitoring: False when the monitoring time policy conditions
            are not met
        isUnderMonitoringMsg: message associated with the monitoring time policy
        isUnderDcountHost: True when the minimal distinct hosts count is breached
        isUnderDcountHostMsg: message associated with the dcount host breach
        object_logical_group_dict: logical group definition for this entity
        isUnderLatencyAlert: True when ingestion latency breaches its threshold
        isUnderLatencyMessage: message associated with the latency status
        isUnderDelayAlert: True when event delay breaches its threshold
        isUnderDelayMessage: message associated with the delay status
        disruption_queue_collection: KV collection handle for the disruption queue
        disruption_queue_record: existing disruption queue record for this
            entity, falsy when none exists
        source_handler: optional; accepted for interface compatibility, not
            used in this function
        monitoring_anomaly_reason: optional anomaly reason code appended when
            isUnderMonitoring is False
        score: optional base score; when not None, hybrid scoring is applied
            and may override the state (red when total >= 100, orange when
            0 < total < 100, green on false positive)
        score_outliers: optional outlier-specific score; <= 0 means outliers
            are suppressed (false positive)
        vtenant_account: optional virtual tenant account, used to resolve
            per-tenant impact scores via get_impact_score/get_entity_impact_score

    Returns:
        tuple: (object_state, status_message, status_message_json, anomaly_reason)
            object_state (str): "green", "blue", "orange" or "red"
            status_message (list): human readable reasons for the state
            status_message_json (dict): structured view including the
                status_message and anomaly_reason lists, and score fields
                when available
            anomaly_reason (list): short code reasons; ["none"] when green
                (unless false positive preserved reasons apply)
    """
    # init status_message and anomaly_reason
    status_message = []
    anomaly_reason = []
    # init status_message_json
    status_message_json = {}
    # init original_object_state
    original_object_state = record.get("object_state", "green")
    # define object_state
    # Check outliers: if isOutlier == 1 but score_outliers <= 0, treat as no outlier (suppressed)
    isOutlierEffective = isOutlier == 1
    if score_outliers is not None and score_outliers <= 0:
        # Outliers are suppressed (false positive), don't treat as outlier
        isOutlierEffective = False
    # green requires: no effective outlier, no sampling anomaly, no dcount
    # breach, no latency alert and no delay alert; anything else starts red
    if (
        (isOutlierEffective == False or isOutlier == 2)
        and (isAnomaly == 0 or isAnomaly == 2)
        and isUnderDcountHost is False
        and isUnderLatencyAlert is False
        and isUnderDelayAlert is False
    ):
        object_state = "green"
    else:
        object_state = "red"
    #
    # Logical group management
    #
    (
        isUnderLogicalGroup,
        LogicalGroupStateInAlert,
        LogicalGroupMsg,
    ) = get_and_manage_logical_group_status(
        splunkd_uri,
        session_key,
        tenant_id,
        record.get("object"),
        object_state,
        record.get("object_group_key"),
        object_logical_group_dict,
    )
    # log debug
    logging.debug(
        f'function get_and_manage_logical_group_status: object="{record.get("object")}", object_state="{object_state}", object_group_key="{record.get("object_group_key")}", isUnderLogicalGroup="{isUnderLogicalGroup}", LogicalGroupStateInAlert="{LogicalGroupStateInAlert}", LogicalGroupMsg="{LogicalGroupMsg}"'
    )
    # if object_state is red but isUnderLogicalGroup is True and LogicalGroupStateInAlert is False, then object_state is blue
    if object_state == "red" and isUnderLogicalGroup is True:
        if LogicalGroupStateInAlert is False:
            object_state = "blue"
    # if object_state is not red or blue but isFuture is True, then object_state is orange
    if object_state not in ["red", "blue"]:
        if isFuture is True:
            object_state = "orange"
    # if object_state is red but isUnderMonitoring is False, then object_state is orange
    if object_state == "red":
        if isUnderMonitoring is False:
            object_state = "orange"
    #
    # Hybrid scoring: Apply score-based logic
    # Outliers are handled separately via score_outliers in get_outliers_status
    #
    total_score = None
    score_definition = {}
    if score is not None:
        # Calculate total score with static increments for anomalies
        base_score = float(score) if score is not None else 0.0
        total_score = base_score
        # Build score definition to track where the score comes from
        # Convert base_score to integer if it's a whole number, otherwise keep as float
        if base_score == int(base_score):
            score_definition["base_score"] = int(base_score)
        else:
            score_definition["base_score"] = base_score
        score_definition["components"] = []
        # Add static increments for each anomaly type (using VT-specific impact scores)
        if isAnomaly == 1:
            increment = get_impact_score(vtenant_account, "impact_score_dsm_data_sampling_anomaly", 36)
            total_score += increment
            score_definition["components"].append({
                "type": "data_sampling_anomaly",
                "score": increment,
                "description": "Data sampling anomaly detected"
            })
        if isUnderDelayAlert is True:
            increment = get_entity_impact_score(record, "dsm", "delay", vtenant_account, 100)
            total_score += increment
            score_definition["components"].append({
                "type": "delay_threshold_breach",
                "score": increment,
                "description": "Delay threshold breached"
            })
        if isUnderLatencyAlert is True:
            increment = get_entity_impact_score(record, "dsm", "latency", vtenant_account, 48)
            total_score += increment
            score_definition["components"].append({
                "type": "latency_threshold_breach",
                "score": increment,
                "description": "Latency threshold breached"
            })
        if isUnderDcountHost is True:
            increment = get_impact_score(vtenant_account, "impact_score_dsm_min_hosts_dcount_breach", 100)
            total_score += increment
            score_definition["components"].append({
                "type": "min_hosts_dcount_breach",
                "score": increment,
                "description": "Minimum hosts dcount threshold breached"
            })
        if isFuture is True:
            increment = get_impact_score(vtenant_account, "impact_score_dsm_future_tolerance_breach", 36)
            total_score += increment
            score_definition["components"].append({
                "type": "future_tolerance_breach",
                "score": increment,
                "description": "Future tolerance breached"
            })
        # Add outlier score if present
        if score_outliers is not None and score_outliers > 0:
            score_definition["score_outliers"] = float(score_outliers)
        # Add score sources if available
        score_source = record.get("score_source", [])
        if score_source:
            score_definition["score_source"] = score_source if isinstance(score_source, list) else [score_source]
        # Check for manual_score increases (positive scores from manual_score source)
        # If manual_score increases the score without a related anomaly, add score_breached component
        score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
        if "manual_score" in score_source_list and total_score and total_score > 0:
            # Check if there are no other anomaly components
            has_other_anomalies = (
                isAnomaly == 1
                or isUnderDelayAlert is True
                or isUnderLatencyAlert is True
                or isUnderDcountHost is True
                or isFuture is True
                or (score_outliers is not None and score_outliers > 0)
            )
            if not has_other_anomalies:
                # Manual score increase without other anomalies - add score_breached component
                score_definition["components"].append({
                    "type": "manual_score",
                    "score": 0,  # Score is already included in base_score calculation
                    "description": "Manual score influence applied without related anomaly"
                })
        # Convert total_score to integer if it's a whole number, otherwise keep as float
        if total_score is not None:
            if total_score == int(total_score):
                score_definition["total_score"] = int(total_score)
            else:
                score_definition["total_score"] = total_score
        else:
            score_definition["total_score"] = total_score
        # Apply score-based logic:
        # - If total_score >= 100: entity should be red (if not already red due to other reasons, keep current state)
        # - If total_score > 0 and < 100: entity should be orange (even if currently green)
        # - If total_score == 0: keep current state
        if total_score >= 100:
            # If score >= 100, ensure entity is red (unless it's blue due to logical group)
            if object_state not in ["red", "blue"]:
                object_state = "red"
                logging.debug(
                    f'set_dsm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", setting state to red (score >= 100)'
                )
            else:
                logging.debug(
                    f'set_dsm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", keeping {object_state} state (score >= 100)'
                )
        elif total_score > 0 and total_score < 100:
            # If score > 0 and < 100, entity should be orange (even if currently green)
            if object_state == "green":
                object_state = "orange"
                # Add status message about score
                score_msg = f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
                # Add outlier context if outliers are present
                if score_outliers is not None and score_outliers > 0:
                    score_msg += f"Outlier anomalies detected with a score of {score_outliers:.1f}. "
                score_msg += "This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
                status_message.append(score_msg)
                logging.debug(
                    f'set_dsm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", setting green to orange (0 < score < 100)'
                )
            elif object_state == "red":
                # Downgrade red to orange if score < 100
                # Only apply score-based downgrade if the red state is NOT due to outliers
                # (outliers with score_outliers >= 100 should still be red)
                if isOutlier != 1:
                    object_state = "orange"
                    # Add status message about score when downgrading
                    score_msg = f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
                    if score_outliers is not None and score_outliers > 0:
                        score_msg += f"Outlier anomalies detected with a score of {score_outliers:.1f}. "
                    score_msg += "This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
                    status_message.append(score_msg)
                    logging.debug(
                        f'set_dsm_status, hybrid scoring: object="{record.get("object")}", '
                        f'total_score="{total_score}", downgrading red to orange (non-outlier anomalies only)'
                    )
                else:
                    # If outlier is present but score_outliers < 100, it was already set to isOutlier=2
                    # in get_outliers_status, so we can still apply score-based logic
                    if score_outliers is not None and score_outliers < 100:
                        object_state = "orange"
                        # Add status message about score when downgrading due to low outlier score
                        score_msg = f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
                        score_msg += f"Outlier anomalies detected with a score of {score_outliers:.1f}. "
                        score_msg += "This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
                        status_message.append(score_msg)
                        logging.debug(
                            f'set_dsm_status, hybrid scoring: object="{record.get("object")}", '
                            f'total_score="{total_score}", score_outliers="{score_outliers}", '
                            f'downgrading red to orange (outlier score too low)'
                        )
                    else:
                        logging.debug(
                            f'set_dsm_status, hybrid scoring: object="{record.get("object")}", '
                            f'total_score="{total_score}", keeping red state (outlier score >= 100)'
                        )
            else:
                logging.debug(
                    f'set_dsm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", keeping {object_state} state (0 < score < 100)'
                )
        else:
            # total_score == 0 or total_score <= 0
            # Check if score is 0 due to false_positive (global false positive, not just outliers)
            score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
            has_false_positive = "false_positive" in score_source_list
            if has_false_positive:
                # Score is 0 due to false_positive, set to green (anomaly_reason will remain visible for audit)
                object_state = "green"
                logging.debug(
                    f'set_dsm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", score_source="{score_source}", '
                    f'setting state to green (false positive set, score cancelled)'
                )
            elif score_outliers is not None and score_outliers <= 0:
                # Check if there are any other issues
                has_other_issues = (
                    (isAnomaly == 1)
                    or isUnderDcountHost is True
                    or isUnderLatencyAlert is True
                    or isUnderDelayAlert is True
                )
                if not has_other_issues:
                    # Outliers are suppressed (false positive), and no other issues, set to green
                    object_state = "green"
                    logging.debug(
                        f'set_dsm_status, hybrid scoring: object="{record.get("object")}", '
                        f'total_score="{total_score}", score_outliers="{score_outliers}", '
                        f'setting state to green (outliers suppressed, no other issues)'
                    )
                else:
                    logging.debug(
                        f'set_dsm_status, hybrid scoring: object="{record.get("object")}", '
                        f'total_score="{total_score}", score_outliers="{score_outliers}", '
                        f'keeping current state (score == 0, but other issues present)'
                    )
            else:
                logging.debug(
                    f'set_dsm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", score_outliers="{score_outliers}", '
                    f'keeping current state (score == 0)'
                )
    # define anomaly_reason
    if object_state == "green":
        status_message.append(isUnderDelayMessage)
        status_message.append(isUnderLatencyMessage)
        # Check if false positive is set - if so, preserve anomaly reasons from score_definition
        score_source = record.get("score_source", [])
        score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
        has_false_positive = "false_positive" in score_source_list
        if has_false_positive and score_definition and "components" in score_definition:
            # Extract anomaly reasons from score_definition components
            for component in score_definition.get("components", []):
                component_type = component.get("type")
                if component_type:
                    mapped_reason = get_anomaly_reason_from_component_type(component_type)
                    if mapped_reason and mapped_reason not in anomaly_reason:
                        anomaly_reason.append(mapped_reason)
            # If no components found, still add "none"
            if not anomaly_reason:
                anomaly_reason.append("none")
        else:
            anomaly_reason.append("none")
        # if in a logical group, add the logical group message
        if isUnderLogicalGroup is True:
            status_message.append(LogicalGroupMsg)
    else:
        # Check for outliers: either isOutlier == 1 (traditional) or score_outliers > 0 (hybrid scoring)
        if isOutlier == 1 or (score_outliers is not None and score_outliers > 0):
            # Always add outlier reasons when outliers are present (either traditional or hybrid scoring)
            outlier_reasons = record.get("isOutlierReason", [])
            if outlier_reasons:
                if isinstance(outlier_reasons, list):
                    # Join the list elements into a single string
                    outlier_reasons_str = " | ".join(outlier_reasons)
                    status_message.append(outlier_reasons_str)
                else:
                    # If it's not a list, append it directly
                    status_message.append(outlier_reasons)
            # Add ml_outliers_detection to anomaly_reason for all outlier cases
            if "ml_outliers_detection" not in anomaly_reason:
                anomaly_reason.append("ml_outliers_detection")
            # Add status message for orange state (score_outliers > 0 and < 100)
            if score_outliers is not None and score_outliers > 0 and score_outliers < 100:
                base_score = float(score) if score is not None else 0.0
                status_message.append(
                    f"Entity has an impact score of {score_outliers:.1f} (base score: {base_score:.1f}), which is above 0 but below 100. "
                    f"This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
                )
        if isAnomaly == 1:
            status_message.append(
                "anomalies detected in the data sampling and format recognition, review the data sampling screen to investigate. This alert means that trackMe detected an issue in the format of the events compared to the format that was previously identified for this source"
            )
            anomaly_reason.append("data_sampling_anomaly")
        if isFuture is True:
            status_message.append(isFutureMsg)
            anomaly_reason.append("future_over_tolerance")
        # Monitoring time policy, add the message first then the anomaly reason
        if isUnderMonitoring is False:
            status_message.append(isUnderMonitoringMsg)
            # Use new monitoring anomaly reason if provided, otherwise use legacy separate reasons
            if monitoring_anomaly_reason:
                anomaly_reason.append(monitoring_anomaly_reason)
        if isUnderDcountHost is True:
            status_message.append(isUnderDcountHostMsg)
            anomaly_reason.append("min_hosts_dcount")
        if isUnderLatencyAlert is True:
            status_message.append(isUnderLatencyMessage)
            anomaly_reason.append("lag_threshold_breached")
        if isUnderDelayAlert is True:
            status_message.append(isUnderDelayMessage)
            anomaly_reason.append("delay_threshold_breached")
        # logical group
        if isUnderLogicalGroup is True:
            status_message.append(LogicalGroupMsg)
            anomaly_reason.append("in_logical_group")
    # form status_message_json
    # NOTE: the lists are shared by reference, later appends to status_message
    # are visible through status_message_json as well
    status_message_json["status_message"] = status_message
    status_message_json["anomaly_reason"] = anomaly_reason
    # Add score information to status_message_json for UI display (sorted alphabetically)
    # Use total_score if calculated (hybrid scoring), otherwise use base score
    if total_score is not None:
        status_message_json["score"] = float(total_score)
        # Update record score to reflect the calculated total_score for UI consistency
        record["score"] = float(total_score)
        # Add score definition for drilldown modal
        if score_definition:
            status_message_json["score_definition"] = score_definition
            record["score_definition"] = json.dumps(score_definition) if isinstance(score_definition, dict) else score_definition
    elif score is not None:
        status_message_json["score"] = float(score)
    if score_outliers is not None:
        status_message_json["score_outliers"] = float(score_outliers)
    if total_score is not None:
        status_message_json["total_score"] = float(total_score)
    #
    # Disruption queue handling: apply the minimal disruption time (if set)
    # and maintain the disruption tracking record
    #
    # get disruption_duration
    if not disruption_queue_record:
        record["disruption_min_time_sec"] = 0
    else:
        logger.debug(
            f'disruption_queue_record="{disruption_queue_record}", getting disruption_duration'
        )
        disruption_object_state = disruption_queue_record.get("object_state", "green")
        try:
            disruption_min_time_sec = int(
                disruption_queue_record.get("disruption_min_time_sec", 0)
            )
        except:
            disruption_min_time_sec = 0
        # add to the record
        record["disruption_min_time_sec"] = disruption_min_time_sec
        try:
            disruption_start_epoch = float(
                disruption_queue_record.get("disruption_start_epoch", 0)
            )
        except:
            disruption_start_epoch = 0
        # Case 1: Entity is no longer in alert state (not red)
        if object_state != "red":
            # Only update if we were previously tracking a disruption
            if disruption_object_state == "red":
                disruption_queue_record["object_state"] = object_state
                disruption_queue_record["disruption_start_epoch"] = 0
                disruption_queue_record["mtime"] = time.time()
                try:
                    disruption_queue_update(
                        disruption_queue_collection, disruption_queue_record
                    )
                except Exception as e:
                    logger.error(f"error updating disruption_queue_record: {e}")
            # NOTE: early return, the trailing "none" sanity check on
            # anomaly_reason below is intentionally skipped here
            return object_state, status_message, status_message_json, anomaly_reason
        # Case 2: Entity is in alert state (red)
        if object_state == "red":
            current_time = time.time()
            # If this is a new disruption, start tracking it
            if disruption_object_state != "red":
                disruption_queue_record["object_state"] = "red"
                disruption_queue_record["disruption_start_epoch"] = current_time
                disruption_queue_record["mtime"] = current_time
                try:
                    disruption_queue_update(
                        disruption_queue_collection, disruption_queue_record
                    )
                except Exception as e:
                    logger.error(f"error updating disruption_queue_record: {e}")
                # For new disruptions, if min time is set, show as blue with message
                if disruption_min_time_sec > 0:
                    object_state = "blue"
                    status_message.append(
                        f"Minimal disruption time is configured for this entity, the current disruption duration is 0 which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
                    )
                    status_message_json["status_message"] = status_message
                # NOTE: early return for new disruptions, the trailing "none"
                # sanity check on anomaly_reason below is intentionally skipped
                return object_state, status_message, status_message_json, anomaly_reason
            # If we're already tracking a disruption, check duration
            if disruption_min_time_sec > 0:
                try:
                    disruption_duration = current_time - disruption_start_epoch
                except Exception as e:
                    logger.error(f"error calculating disruption_duration: {e}")
                    disruption_duration = 0
                # If duration hasn't breached threshold, show as blue with message
                if disruption_duration < disruption_min_time_sec:
                    object_state = "blue"
                    status_message.append(
                        f"Minimal disruption time is configured for this entity, the current disruption duration is {convert_seconds_to_duration(disruption_duration)} which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
                    )
                    status_message_json["status_message"] = status_message
    # anomaly_reason sanitify check, if the list has more than 1 item, and contains "none", remove it
    if isinstance(anomaly_reason, list):
        if len(anomaly_reason) > 1 and "none" in anomaly_reason:
            anomaly_reason.remove("none")
    # return
    logging.debug(
        f'set_dsm_status, object="{record.get("object")}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
    )
    return (
        object_state,
        status_message,
        status_message_json,
        anomaly_reason,
    )
def set_dhm_status(
    logger,
    splunkd_uri,
    session_key,
    tenant_id,
    record,
    isOutlier,
    isFuture,
    isFutureMsg,
    isUnderMonitoring,
    isUnderMonitoringMsg,
    object_logical_group_dict,
    isUnderLatencyAlert,
    isUnderLatencyMessage,
    isUnderDelayAlert,
    isUnderDelayMessage,
    default_splk_dhm_alerting_policy,
    disruption_queue_collection,
    disruption_queue_record,
    source_handler=None,
    monitoring_anomaly_reason=None,
    score=None,
    score_outliers=None,
    vtenant_account=None,
):
    """
    Compute the final state of a splk-dhm (Data Host Monitoring) entity.

    The decision is taken in successive passes, each of which may override the
    state computed by the previous one:

    1. Baseline state from outliers / latency / delay flags (green or red).
    2. Logical group membership (red may become blue when the group is healthy).
    3. Per-sourcetype summary (``splk_dhm_st_summary``) evaluated against the
       effective alerting policy (entity-level policy, falling back to
       ``default_splk_dhm_alerting_policy`` when set to ``global_policy``).
    4. Entity-level global thresholds when no sourcetype is green.
    5. Future tolerance (orange) and monitoring time policy (red -> orange).
    6. Hybrid scoring: a total score is built from the base ``score`` plus
       static increments per anomaly type; >= 100 forces red, 0 < score < 100
       forces/downgrades to orange, and a ``false_positive`` score source or a
       suppressed outlier score can force green.
    7. Anomaly reasons and status messages are assembled.
    8. Disruption queue: if a minimal disruption time is configured, a fresh or
       short-lived red state is reported as blue until the threshold is breached.
       This step updates the disruption queue KVstore as a side effect.

    Args:
        logger: caller-provided logger used for error/debug messages.
        splunkd_uri: splunkd management URI, forwarded to logical group handling.
        session_key: Splunk session key, forwarded to logical group handling.
        tenant_id: TrackMe tenant identifier.
        record: the entity KVstore record; mutated in place (``score``,
            ``score_definition``, ``disruption_min_time_sec`` may be set).
        isOutlier: outlier flag; 1 = in outlier, 2 = outlier downgraded upstream
            (see the hybrid-scoring comments below), other values = no outlier.
        isFuture / isFutureMsg: future tolerance breach flag and message.
        isUnderMonitoring / isUnderMonitoringMsg: monitoring time policy flag
            and message (False means outside of monitored hours).
        object_logical_group_dict: pre-fetched logical groups lookup dict.
        isUnderLatencyAlert / isUnderLatencyMessage: lag threshold flag/message.
        isUnderDelayAlert / isUnderDelayMessage: delay threshold flag/message.
        default_splk_dhm_alerting_policy: tenant-level default alerting policy
            (``track_per_host`` or ``track_per_sourcetype``).
        disruption_queue_collection: KVstore collection for the disruption queue.
        disruption_queue_record: current disruption queue record for this
            entity, or a falsy value when none exists.
        source_handler: unused here; kept for call-site compatibility.
        monitoring_anomaly_reason: anomaly reason code to publish when
            isUnderMonitoring is False.
        score: base impact score for hybrid scoring; when None the hybrid
            scoring pass is skipped entirely.
        score_outliers: outlier-specific score; <= 0 means outliers are
            suppressed (false positive).
        vtenant_account: virtual tenant account record, used to resolve
            per-tenant impact scores.

    Returns:
        tuple: (object_state, status_message, status_message_json,
        anomaly_reason, splk_dhm_alerting_policy) where object_state is one of
        "green", "blue", "orange", "red".
    """
    # init status_message and anomaly_reason
    status_message = []
    anomaly_reason = []
    # init status_message_json
    status_message_json = {}
    # define object_state
    # Check outliers: if isOutlier == 1 but score_outliers <= 0, treat as no outlier (suppressed)
    isOutlierEffective = isOutlier == 1
    if score_outliers is not None and score_outliers <= 0:
        # Outliers are suppressed (false positive), don't treat as outlier
        isOutlierEffective = False
    # Baseline: green only when no effective outlier (isOutlier == 2 marks an
    # outlier already downgraded upstream) and neither lag nor delay is in alert
    if (
        (isOutlierEffective == False or isOutlier == 2)
        and isUnderLatencyAlert is False
        and isUnderDelayAlert is False
    ):
        object_state = "green"
    else:
        object_state = "red"
    #
    # Logical group management
    #
    (
        isUnderLogicalGroup,
        LogicalGroupStateInAlert,
        LogicalGroupMsg,
    ) = get_and_manage_logical_group_status(
        splunkd_uri,
        session_key,
        tenant_id,
        record.get("object"),
        object_state,
        record.get("object_group_key"),
        object_logical_group_dict,
    )
    # log debug
    logging.debug(
        f'function get_and_manage_logical_group_status: object="{record.get("object")}", object_state="{object_state}", object_group_key="{record.get("object_group_key")}", isUnderLogicalGroup="{isUnderLogicalGroup}", LogicalGroupStateInAlert="{LogicalGroupStateInAlert}", LogicalGroupMsg="{LogicalGroupMsg}"'
    )
    # Resolve the effective alerting policy; empty string falls back to global
    # NOTE(review): len() raises TypeError if the record stores an explicit
    # None for splk_dhm_alerting_policy — confirm upstream always sets a string
    splk_dhm_alerting_policy = record.get("splk_dhm_alerting_policy", "global_policy")
    if not len(splk_dhm_alerting_policy) > 0:
        splk_dhm_alerting_policy = "global_policy"
    splk_dhm_st_summary = record.get("splk_dhm_st_summary")
    # get the entity global max delay allowed
    global_max_delay_allowed = int(
        round(float(record.get("data_max_delay_allowed", 0)), 0)
    )
    # get the entity global max lag allowed
    global_max_lag_allowed = int(round(float(record.get("data_max_lag_allowed", 0)), 0))
    # get the entity global last delay seen (data_last_lag_seen)
    global_last_event_lag = int(round(float(record.get("data_last_lag_seen", 0)), 0))
    # get the entity global last lag seen (data_last_ingestion_lag_seen)
    global_last_ingest_lag = int(
        round(float(record.get("data_last_ingestion_lag_seen", 0)), 0)
    )
    # Convert splk_dhm_st_summary to a list if it is a string
    if isinstance(splk_dhm_st_summary, str):
        splk_dhm_st_summary = [splk_dhm_st_summary]
    # counters
    count_red = 0
    count_green = 0
    sourcetypes_red_list = []
    # retrieve host_idx_blocklists, host_st_blocklists
    host_idx_blocklists = record.get("host_idx_blocklists", [])
    host_st_blocklists = record.get("host_st_blocklists", [])
    # if string, then turn into list from comma separated string
    if isinstance(host_idx_blocklists, str):
        host_idx_blocklists = host_idx_blocklists.split(",")
    if isinstance(host_st_blocklists, str):
        host_st_blocklists = host_st_blocklists.split(",")
    # splk_dhm_st_summary can actually be a list
    # Each item is a dict body without braces; try ast.literal_eval first
    # (single-quoted dicts) then json.loads (double-quoted) as a fallback
    if isinstance(splk_dhm_st_summary, list):
        for item_str in splk_dhm_st_summary:
            dict_str = "{" + item_str + "}"
            dict_loaded = False
            dict_loading_error = []
            # try both options
            try:
                new_dict = ast.literal_eval(dict_str)
                dict_loaded = True
            except Exception as e:
                dict_loaded = False
                dict_loading_error.append(str(e))
            if not dict_loaded:
                try:
                    new_dict = json.loads(item_str)
                    dict_loaded = True
                except Exception as e:
                    dict_loaded = False
                    dict_loading_error.append(str(e))
            if dict_loaded:
                # handle blocklists: drop sourcetype entries whose index or
                # sourcetype is blocklisted for this host
                new_dict = {
                    key: val
                    for key, val in new_dict.items()
                    if val["idx"] not in host_idx_blocklists
                    and val["st"] not in host_st_blocklists
                }
                # Iterate through the inner dictionaries
                for inner_dict in new_dict.values():
                    if inner_dict.get("state") == "red":
                        count_red += 1
                        max_lag_allowed = float(inner_dict.get("max_lag_allowed"))
                        max_delay_allowed = float(inner_dict.get("max_delay_allowed"))
                        last_ingest_lag = float(inner_dict.get("last_ingest_lag"))
                        last_event_lag = float(inner_dict.get("last_event_lag"))
                        if (
                            last_ingest_lag > max_lag_allowed
                            and "lag_threshold_breached" not in anomaly_reason
                        ):
                            anomaly_reason.append("lag_threshold_breached")
                            sourcetypes_red_list.append(
                                f'(idx: {inner_dict.get("idx")}, st: {inner_dict.get("st")}, anomaly_reason: lag_threshold_breached)'
                            )
                        if (
                            last_event_lag > max_delay_allowed
                            and "delay_threshold_breached" not in anomaly_reason
                        ):
                            anomaly_reason.append("delay_threshold_breached")
                            sourcetypes_red_list.append(
                                f'(idx: {inner_dict.get("idx")}, st: {inner_dict.get("st")}, anomaly_reason: delay_threshold_breached)'
                            )
                    elif inner_dict.get("state") == "green":
                        count_green += 1
            else:
                logging.error(
                    f"Error in processing item_str: {item_str}. Error: {dict_loading_error}"
                )
    logging.debug(
        f'object="{record.get("object")}", count_red={count_red}, count_green={count_green}'
    )
    # turn sourcetypes_red_list into a pipe separated string
    sourcetypes_red_list = "|".join(sourcetypes_red_list)
    # Decision making based on the counts of red and green states:
    # global_policy defers to the tenant default; track_per_sourcetype turns
    # any red sourcetype into a red entity; track_per_host keeps the baseline
    if splk_dhm_alerting_policy == "global_policy":
        if default_splk_dhm_alerting_policy == "track_per_host":
            # Use object_state as it is
            pass
        elif default_splk_dhm_alerting_policy == "track_per_sourcetype":
            if count_red > 0:
                object_state = "red"
                status_message.append(
                    f"One or more sourcetypes are in alert for this entity, and policy is set to track_per_sourcetype, sourcetypes in alert: {sourcetypes_red_list}"
                )
            else:
                # Use object_state as it is
                pass
    elif splk_dhm_alerting_policy == "track_per_host":
        # Use object_state as it is
        pass
    elif splk_dhm_alerting_policy == "track_per_sourcetype":
        if count_red > 0:
            object_state = "red"
            status_message.append(
                f"One or more sourcetypes are in alert for this entity, and policy is set to track_per_sourcetype, sourcetypes in alert: {sourcetypes_red_list}"
            )
        else:
            # Use object_state as it is
            pass
    else:
        # Use object_state as it is
        pass
    # if all sourcetypes are in alert, object_state is red or orange depending on the global max delay entity values
    # NOTE(review): count_green == 0 also holds when no sourcetype summary was
    # parsed at all (empty/missing splk_dhm_st_summary) — confirm this is intended
    # NOTE(review): the breach messages below label some lag values as
    # "max_delay_allowed"/"last_event_lag" — wording looks copy/pasted, verify
    if (
        count_green == 0
        and (global_last_event_lag >= global_max_delay_allowed)
        and (global_last_ingest_lag >= global_max_lag_allowed)
    ):
        object_state = "red"
        status_message.append(
            f"all sourcetypes are in alert for this entity, global entity max delay allowed is breached (max_delay_allowed: {global_max_delay_allowed} seconds, duration: {convert_seconds_to_duration(global_max_delay_allowed)}, last_event_lag: {global_last_event_lag} seconds, duration: {convert_seconds_to_duration(global_last_event_lag)}), global entity max lag allowed is breached (max_delay_allowed: {global_max_lag_allowed} seconds, duration: {convert_seconds_to_duration(global_max_lag_allowed)}), last_event_lag: {global_last_ingest_lag} seconds, duration: {convert_seconds_to_duration(global_last_ingest_lag)})"
        )
    elif (
        count_green == 0
        and (global_last_event_lag < global_max_delay_allowed)
        and (global_last_ingest_lag >= global_max_lag_allowed)
    ):
        object_state = "red"
        status_message.append(
            f"all sourcetypes are in alert for this entity, global entity max delay allowed is not breached but max lag allowed is breached (max_delay_allowed: {global_max_delay_allowed} seconds, duration: {convert_seconds_to_duration(global_max_delay_allowed)}, last_event_lag: {global_last_event_lag} seconds, duration: {convert_seconds_to_duration(global_last_event_lag)})"
        )
    elif (
        count_green == 0
        and (global_last_event_lag >= global_max_delay_allowed)
        and (global_last_ingest_lag < global_max_lag_allowed)
    ):
        object_state = "red"
        status_message.append(
            f"all sourcetypes are in alert for this entity, global entity max delay allowed is breached but max lag allowed is not breached (max_delay_allowed: {global_max_delay_allowed} seconds, duration: {convert_seconds_to_duration(global_max_delay_allowed)}, last_event_lag: {global_last_event_lag} seconds, duration: {convert_seconds_to_duration(global_last_event_lag)})"
        )
    elif (
        count_green == 0
        and (global_last_event_lag < global_max_delay_allowed)
        and (global_last_ingest_lag < global_max_lag_allowed)
    ):
        object_state = "green"
        status_message.append(
            f"all sourcetypes are in alert for this entity, however global entity max delay allowed and max lag allowed are not breached (max_delay_allowed: {global_max_delay_allowed} seconds, duration: {convert_seconds_to_duration(global_max_delay_allowed)}, last_event_lag: {global_last_event_lag} seconds, duration: {convert_seconds_to_duration(global_last_event_lag)})"
        )
    elif count_green == 0:
        object_state = "red"
        status_message.append(
            f"all sourcetypes are in alert for this entity, however global entity level max delay allowed and max lag allowed could not be determined, verify TrackMe logs for more information (max_delay_allowed: {global_max_delay_allowed} seconds, duration: {convert_seconds_to_duration(global_max_delay_allowed)}, last_event_lag: {global_last_event_lag} seconds, duration: {convert_seconds_to_duration(global_last_event_lag)})"
        )
    # if object_state is red but isUnderLogicalGroup is True and LogicalGroupStateInAlert is False, then object_state is blue
    if object_state == "red" and isUnderLogicalGroup is True:
        if LogicalGroupStateInAlert is False:
            object_state = "blue"
    # if object_state is not red or blue but isFuture is True, then object_state is orange
    if object_state not in ["red", "blue"]:
        if isFuture is True:
            object_state = "orange"
    # if object_state is red but if isUnderMonitoring is False, then object_state is orange
    if object_state == "red":
        if isUnderMonitoring is False:
            object_state = "orange"
    #
    # Hybrid scoring: Apply score-based logic
    # Outliers are handled separately via score_outliers in get_outliers_status
    #
    total_score = None
    score_definition = {}
    if score is not None:
        # Calculate total score with static increments for anomalies
        base_score = float(score) if score is not None else 0.0
        total_score = base_score
        # Build score definition to track where the score comes from
        # Convert base_score to integer if it's a whole number, otherwise keep as float
        if base_score == int(base_score):
            score_definition["base_score"] = int(base_score)
        else:
            score_definition["base_score"] = base_score
        score_definition["components"] = []
        # Add static increments for each anomaly type (using VT-specific impact scores)
        # Fallback defaults: delay=100, latency=48, future=36
        if isUnderDelayAlert is True:
            increment = get_entity_impact_score(record, "dhm", "delay", vtenant_account, 100)
            total_score += increment
            score_definition["components"].append({
                "type": "delay_threshold_breach",
                "score": increment,
                "description": "Delay threshold breached"
            })
        if isUnderLatencyAlert is True:
            increment = get_entity_impact_score(record, "dhm", "latency", vtenant_account, 48)
            total_score += increment
            score_definition["components"].append({
                "type": "latency_threshold_breach",
                "score": increment,
                "description": "Latency threshold breached"
            })
        if isFuture is True:
            increment = get_impact_score(vtenant_account, "impact_score_dhm_future_tolerance_breach", 36)
            total_score += increment
            score_definition["components"].append({
                "type": "future_tolerance_breach",
                "score": increment,
                "description": "Future tolerance breached"
            })
        # Add outlier score if present
        if score_outliers is not None and score_outliers > 0:
            score_definition["score_outliers"] = float(score_outliers)
        # Add score sources if available
        score_source = record.get("score_source", [])
        if score_source:
            score_definition["score_source"] = score_source if isinstance(score_source, list) else [score_source]
        # Convert total_score to integer if it's a whole number, otherwise keep as float
        if total_score is not None:
            if total_score == int(total_score):
                score_definition["total_score"] = int(total_score)
            else:
                score_definition["total_score"] = total_score
        else:
            score_definition["total_score"] = total_score
        # Apply score-based logic:
        # - If total_score >= 100: entity should be red (if not already red due to other reasons, keep current state)
        # - If total_score > 0 and < 100: entity should be orange (even if currently green)
        # - If total_score == 0: keep current state
        if total_score >= 100:
            # If score >= 100, ensure entity is red (unless it's blue due to logical group)
            if object_state not in ["red", "blue"]:
                object_state = "red"
                logging.debug(
                    f'set_dhm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", setting state to red (score >= 100)'
                )
            else:
                logging.debug(
                    f'set_dhm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", keeping {object_state} state (score >= 100)'
                )
        elif total_score > 0 and total_score < 100:
            # If score > 0 and < 100, entity should be orange (even if currently green)
            if object_state == "green":
                object_state = "orange"
                # Add status message about score
                score_msg = f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
                # Add outlier context if outliers are present
                if score_outliers is not None and score_outliers > 0:
                    score_msg += f"Outlier anomalies detected with a score of {score_outliers:.1f}. "
                score_msg += "This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
                status_message.append(score_msg)
                logging.debug(
                    f'set_dhm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", setting green to orange (0 < score < 100)'
                )
            elif object_state == "red":
                # Downgrade red to orange if score < 100
                # Only apply score-based downgrade if the red state is NOT due to outliers
                # (outliers with score_outliers >= 100 should still be red)
                if isOutlier != 1:
                    object_state = "orange"
                    # Add status message about score when downgrading
                    score_msg = f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
                    if score_outliers is not None and score_outliers > 0:
                        score_msg += f"Outlier anomalies detected with a score of {score_outliers:.1f}. "
                    score_msg += "This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
                    status_message.append(score_msg)
                    logging.debug(
                        f'set_dhm_status, hybrid scoring: object="{record.get("object")}", '
                        f'total_score="{total_score}", downgrading red to orange (non-outlier anomalies only)'
                    )
                else:
                    # If outlier is present but score_outliers < 100, it was already set to isOutlier=2
                    # in get_outliers_status, so we can still apply score-based logic
                    if score_outliers is not None and score_outliers < 100:
                        object_state = "orange"
                        # Add status message about score when downgrading due to low outlier score
                        score_msg = f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
                        score_msg += f"Outlier anomalies detected with a score of {score_outliers:.1f}. "
                        score_msg += "This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
                        status_message.append(score_msg)
                        logging.debug(
                            f'set_dhm_status, hybrid scoring: object="{record.get("object")}", '
                            f'total_score="{total_score}", score_outliers="{score_outliers}", '
                            f'downgrading red to orange (outlier score too low)'
                        )
                    else:
                        logging.debug(
                            f'set_dhm_status, hybrid scoring: object="{record.get("object")}", '
                            f'total_score="{total_score}", keeping red state (outlier score >= 100)'
                        )
            else:
                logging.debug(
                    f'set_dhm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", keeping {object_state} state (0 < score < 100)'
                )
        else:
            # total_score == 0 or total_score <= 0
            # Check if score is 0 due to false_positive (global false positive, not just outliers)
            score_source = record.get("score_source", [])
            score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
            has_false_positive = "false_positive" in score_source_list
            if has_false_positive:
                # Score is 0 due to false_positive, set to green (anomaly_reason will remain visible for audit)
                object_state = "green"
                logging.debug(
                    f'set_dhm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", score_source="{score_source}", '
                    f'setting state to green (false positive set, score cancelled)'
                )
            elif score_outliers is not None and score_outliers <= 0:
                # Check if there are any other issues
                has_other_issues = (
                    isUnderLatencyAlert is True
                    or isUnderDelayAlert is True
                )
                if not has_other_issues:
                    # Outliers are suppressed (false positive), and no other issues, set to green
                    object_state = "green"
                    logging.debug(
                        f'set_dhm_status, hybrid scoring: object="{record.get("object")}", '
                        f'total_score="{total_score}", score_outliers="{score_outliers}", '
                        f'setting state to green (outliers suppressed, no other issues)'
                    )
                else:
                    logging.debug(
                        f'set_dhm_status, hybrid scoring: object="{record.get("object")}", '
                        f'total_score="{total_score}", score_outliers="{score_outliers}", '
                        f'keeping current state (score == 0, but other issues present)'
                    )
            else:
                logging.debug(
                    f'set_dhm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", score_outliers="{score_outliers}", '
                    f'keeping current state (score == 0)'
                )
    # define anomaly_reason: green entities report "none" unless a false
    # positive preserved component reasons for audit purposes
    if object_state == "green":
        status_message.append(isUnderDelayMessage)
        status_message.append(isUnderLatencyMessage)
        # Check if false positive is set - if so, preserve anomaly reasons from score_definition
        score_source = record.get("score_source", [])
        score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
        has_false_positive = "false_positive" in score_source_list
        if has_false_positive and score_definition and "components" in score_definition:
            # Extract anomaly reasons from score_definition components
            for component in score_definition.get("components", []):
                component_type = component.get("type")
                if component_type:
                    mapped_reason = get_anomaly_reason_from_component_type(component_type)
                    if mapped_reason and mapped_reason not in anomaly_reason:
                        anomaly_reason.append(mapped_reason)
            # If no components found, still add "none"
            if not anomaly_reason:
                anomaly_reason.append("none")
        else:
            anomaly_reason.append("none")
        # if in a logical group, add the logical group message
        if isUnderLogicalGroup is True:
            status_message.append(LogicalGroupMsg)
    else:
        # Check for outliers: either isOutlier == 1 (traditional) or score_outliers > 0 (hybrid scoring)
        if isOutlier == 1 or (score_outliers is not None and score_outliers > 0):
            # Always add outlier reasons when outliers are present (either traditional or hybrid scoring)
            outlier_reasons = record.get("isOutlierReason", [])
            if outlier_reasons:
                if isinstance(outlier_reasons, list):
                    # Join the list elements into a single string
                    outlier_reasons_str = " | ".join(outlier_reasons)
                    status_message.append(outlier_reasons_str)
                else:
                    # If it's not a list, append it directly
                    status_message.append(outlier_reasons)
            # Add ml_outliers_detection to anomaly_reason for all outlier cases
            if "ml_outliers_detection" not in anomaly_reason:
                anomaly_reason.append("ml_outliers_detection")
            # Add status message for orange state (score_outliers > 0 and < 100)
            if score_outliers is not None and score_outliers > 0 and score_outliers < 100:
                base_score = float(score) if score is not None else 0.0
                status_message.append(
                    f"Entity has an impact score of {score_outliers:.1f} (base score: {base_score:.1f}), which is above 0 but below 100. "
                    f"This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
                )
        if isFuture is True:
            status_message.append(isFutureMsg)
            anomaly_reason.append("future_over_tolerance")
        # Monitoring time policy, add the message first then the anomaly reason
        if isUnderMonitoring is False:
            status_message.append(isUnderMonitoringMsg)
            # Use new monitoring anomaly reason if provided
            if monitoring_anomaly_reason:
                anomaly_reason.append(monitoring_anomaly_reason)
        if isUnderLatencyAlert is True:
            status_message.append(isUnderLatencyMessage)
            anomaly_reason.append("lag_threshold_breached")
        if isUnderDelayAlert is True:
            status_message.append(isUnderDelayMessage)
            anomaly_reason.append("delay_threshold_breached")
        # logical group
        if isUnderLogicalGroup is True:
            status_message.append(LogicalGroupMsg)
            anomaly_reason.append("in_logical_group")
    # form status_message_json
    status_message_json["status_message"] = status_message
    # deduplicate anomaly_reason (note: set() does not preserve ordering)
    anomaly_reason = list(set(anomaly_reason))
    status_message_json["anomaly_reason"] = anomaly_reason
    # Add score information to status_message_json for UI display (sorted alphabetically)
    # Use total_score if calculated (hybrid scoring), otherwise use base score
    if total_score is not None:
        status_message_json["score"] = float(total_score)
        # Update record score to reflect the calculated total_score for UI consistency
        record["score"] = float(total_score)
        # Add score definition for drilldown modal
        if score_definition:
            status_message_json["score_definition"] = score_definition
            record["score_definition"] = json.dumps(score_definition) if isinstance(score_definition, dict) else score_definition
    elif score is not None:
        status_message_json["score"] = float(score)
    if score_outliers is not None:
        status_message_json["score_outliers"] = float(score_outliers)
    if total_score is not None:
        status_message_json["total_score"] = float(total_score)
    # get disruption_duration
    # Disruption queue handling: without a record, no minimal disruption time
    # applies; otherwise track red transitions and mask short disruptions as blue
    if not disruption_queue_record:
        record["disruption_min_time_sec"] = 0
    else:
        logger.debug(
            f'disruption_queue_record="{disruption_queue_record}", getting disruption_duration'
        )
        disruption_object_state = disruption_queue_record.get("object_state", "green")
        try:
            disruption_min_time_sec = int(
                disruption_queue_record.get("disruption_min_time_sec", 0)
            )
        except:
            disruption_min_time_sec = 0
        # add to the record
        record["disruption_min_time_sec"] = disruption_min_time_sec
        try:
            disruption_start_epoch = float(
                disruption_queue_record.get("disruption_start_epoch", 0)
            )
        except:
            disruption_start_epoch = 0
        # Case 1: Entity is no longer in alert state (not red)
        if object_state != "red":
            # Only update if we were previously tracking a disruption
            if disruption_object_state == "red":
                disruption_queue_record["object_state"] = object_state
                disruption_queue_record["disruption_start_epoch"] = 0
                disruption_queue_record["mtime"] = time.time()
                try:
                    disruption_queue_update(
                        disruption_queue_collection, disruption_queue_record
                    )
                except Exception as e:
                    logger.error(f"error updating disruption_queue_record: {e}")
            return (
                object_state,
                status_message,
                status_message_json,
                anomaly_reason,
                splk_dhm_alerting_policy,
            )
        # Case 2: Entity is in alert state (red)
        if object_state == "red":
            current_time = time.time()
            # If this is a new disruption, start tracking it
            if disruption_object_state != "red":
                disruption_queue_record["object_state"] = "red"
                disruption_queue_record["disruption_start_epoch"] = current_time
                disruption_queue_record["mtime"] = current_time
                try:
                    disruption_queue_update(
                        disruption_queue_collection, disruption_queue_record
                    )
                except Exception as e:
                    logger.error(f"error updating disruption_queue_record: {e}")
                # For new disruptions, if min time is set, show as blue with message
                if disruption_min_time_sec > 0:
                    object_state = "blue"
                    status_message.append(
                        f"Minimal disruption time is configured for this entity, the current disruption duration is 0 which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
                    )
                    status_message_json["status_message"] = status_message
                return (
                    object_state,
                    status_message,
                    status_message_json,
                    anomaly_reason,
                    splk_dhm_alerting_policy,
                )
            # If we're already tracking a disruption, check duration
            if disruption_min_time_sec > 0:
                try:
                    disruption_duration = current_time - disruption_start_epoch
                except Exception as e:
                    logger.error(f"error calculating disruption_duration: {e}")
                    disruption_duration = 0
                # If duration hasn't breached threshold, show as blue with message
                if disruption_duration < disruption_min_time_sec:
                    object_state = "blue"
                    status_message.append(
                        f"Minimal disruption time is configured for this entity, the current disruption duration is {convert_seconds_to_duration(disruption_duration)} which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
                    )
                    status_message_json["status_message"] = status_message
    # anomaly_reason sanity check, if the list has more than 1 item, and contains "none", remove it
    if isinstance(anomaly_reason, list):
        if len(anomaly_reason) > 1 and "none" in anomaly_reason:
            anomaly_reason.remove("none")
    # return
    logging.debug(
        f'set_dhm_status, object="{record.get("object")}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
    )
    return (
        object_state,
        status_message,
        status_message_json,
        anomaly_reason,
        splk_dhm_alerting_policy,
    )
def set_mhm_status(
logger,
splunkd_uri,
session_key,
tenant_id,
record,
metric_details,
isFuture,
isFutureMsg,
object_logical_group_dict,
disruption_queue_collection,
disruption_queue_record,
source_handler=None,
score=None,
score_outliers=None,
vtenant_account=None,
):
"""
Create a function called set_mhm_status:
- arguments: record, isFuture, isFutureMsg, isUnderLogicalGroup, LogicalGroupStateInAlert
- returns:
object_state (string): blue, orange, green, red
anomaly_reason (list): list of short code reasons why the object is in anomaly
status_message (list): list of long description reasons why the object is in anomaly
- behaviour:
object_state:
green if:
all metric caterogies are green
blue if:
Any of the condition above is not met, but isUnderLogicalGroup is True and LogicalGroupStateInAlert is True
orange if:
All green conditions are met except for isFuture which would be True
red if:
Any of the green conditions are not met, and blue conditions and orange conditions are not met
anomaly_reason:
if object_state is green, anomnaly_reason is None
Otherwise, anomaly_reason is a list containing the reasons why the object is in anomaly
"""
# init status_message and anomaly_reason
status_message = []
anomaly_reason = []
# init status_message_json
status_message_json = {}
# define object_state
object_state = "green"
#
# Logical group management
#
(
isUnderLogicalGroup,
LogicalGroupStateInAlert,
LogicalGroupMsg,
) = get_and_manage_logical_group_status(
splunkd_uri,
session_key,
tenant_id,
record.get("object"),
object_state,
record.get("object_group_key"),
object_logical_group_dict,
)
# log debug
logging.debug(
f'function get_and_manage_logical_group_status: object="{record.get("object")}", object_state="{object_state}", object_group_key="{record.get("object_group_key")}", isUnderLogicalGroup="{isUnderLogicalGroup}", LogicalGroupStateInAlert="{LogicalGroupStateInAlert}", LogicalGroupMsg="{LogicalGroupMsg}"'
)
# Convert metric_details to a list if it is a string
if isinstance(metric_details, str):
metric_details = [metric_details]
# counters
count_red = 0
count_green = 0
metrics_red_list = []
# splk_dhm_st_summary can actually be a list
if isinstance(metric_details, list):
for item_str in metric_details:
try:
new_dict = ast.literal_eval(item_str)
# Iterate through the inner dictionaries
for inner_dict in new_dict.values():
if inner_dict.get("state") == "red":
count_red += 1
anomaly_reason.append("delay_threshold_breached")
metrics_red_list.append(
f'(idx: {inner_dict.get("idx")}, metrics: {inner_dict.get("metric_category")}, anomaly_reason: delay_threshold_breached)'
)
elif inner_dict.get("state") == "green":
count_green += 1
except Exception as e:
logging.error(
f"Error in processing item_str: {item_str}. Error: {str(e)}"
)
logging.debug(
f'object="{record.get("object")}", count_red={count_red}, count_green={count_green}'
)
# turn metrics_red_list into a pipe separated string
metrics_red_list = "|".join(metrics_red_list)
# Decision making based on the counts of red and green states
if count_red > 0:
object_state = "red"
status_message.append(
f"One or more metric categories are in alert for this entity, metrics in alert: {metrics_red_list}"
)
else:
# Use object_state as it is
pass
# if all metrics are in alert, then object_state is red
if count_green == 0:
object_state = "red"
status_message.append("all metric categories are in alert for this entity")
# if object_state is red but isUnderLogicalGroup is True and LogicalGroupStateInAlert is False, then object_state is blue
if object_state == "red" and isUnderLogicalGroup is True:
if LogicalGroupStateInAlert is False:
object_state = "blue"
# if object_state is not red or blue but isFuture is True, then object_state is orange
if object_state not in ["red", "blue"]:
if isFuture is True:
object_state = "orange"
#
# Hybrid scoring: Apply score-based logic
# MHM doesn't have outliers, only future tolerance and metric alerts
#
total_score = None
score_definition = {}
if score is not None:
# Calculate total score with static increments for anomalies
base_score = float(score) if score is not None else 0.0
total_score = base_score
# Build score definition to track where the score comes from
# Convert base_score to integer if it's a whole number, otherwise keep as float
if base_score == int(base_score):
score_definition["base_score"] = int(base_score)
else:
score_definition["base_score"] = base_score
score_definition["components"] = []
# Add static increments for each anomaly type (using VT-specific impact scores)
if count_red > 0:
increment = get_impact_score(vtenant_account, "impact_score_mhm_metric_alert", 100)
total_score += increment
score_definition["components"].append({
"type": "metric_alert",
"score": increment,
"description": "One or more metric categories in alert"
})
if isFuture is True:
increment = get_impact_score(vtenant_account, "impact_score_mhm_future_tolerance_breach", 36)
total_score += increment
score_definition["components"].append({
"type": "future_tolerance_breach",
"score": increment,
"description": "Future tolerance breached"
})
# Add outlier score if present
if score_outliers is not None and score_outliers > 0:
score_definition["score_outliers"] = float(score_outliers)
# Add score sources if available
score_source = record.get("score_source", [])
if score_source:
score_definition["score_source"] = score_source if isinstance(score_source, list) else [score_source]
# Convert total_score to integer if it's a whole number, otherwise keep as float
if total_score is not None:
if total_score == int(total_score):
score_definition["total_score"] = int(total_score)
else:
score_definition["total_score"] = total_score
else:
score_definition["total_score"] = total_score
# Apply score-based logic:
# - If total_score >= 100: entity should be red (if not already red due to other reasons, keep current state)
# - If total_score > 0 and < 100: entity should be orange (even if currently green)
# - If total_score == 0: keep current state
if total_score >= 100:
# If score >= 100, ensure entity is red (unless it's blue due to logical group)
if object_state not in ["red", "blue"]:
object_state = "red"
logging.debug(
f'set_mhm_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", setting state to red (score >= 100)'
)
else:
logging.debug(
f'set_mhm_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", keeping {object_state} state (score >= 100)'
)
elif total_score > 0 and total_score < 100:
# If score > 0 and < 100, entity should be orange (even if currently green)
if object_state == "green":
object_state = "orange"
# Add status message about score
status_message.append(
f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
f"This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
)
logging.debug(
f'set_mhm_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", setting green to orange (0 < score < 100)'
)
elif object_state == "red":
# Downgrade red to orange if score < 100
object_state = "orange"
logging.debug(
f'set_mhm_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", downgrading red to orange'
)
else:
logging.debug(
f'set_mhm_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", keeping {object_state} state (0 < score < 100)'
)
else:
# total_score == 0 or total_score <= 0
# Check if score is 0 due to false_positive (global false positive, not just outliers)
score_source = record.get("score_source", [])
score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
has_false_positive = "false_positive" in score_source_list
if has_false_positive:
# Score is 0 due to false_positive, set to green (anomaly_reason will remain visible for audit)
object_state = "green"
logging.debug(
f'set_mhm_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", score_source="{score_source}", '
f'setting state to green (false positive set, score cancelled)'
)
else:
# total_score == 0, keep current state
logging.debug(
f'set_mhm_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", keeping current state (score == 0)'
)
# define anomaly_reason
if object_state == "green":
status_message.append(
"All metric categories are in normal state for this entity"
)
# Check if false positive is set - if so, preserve anomaly reasons from score_definition
score_source = record.get("score_source", [])
score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
has_false_positive = "false_positive" in score_source_list
if has_false_positive and score_definition and "components" in score_definition:
# Extract anomaly reasons from score_definition components
for component in score_definition.get("components", []):
component_type = component.get("type")
if component_type:
mapped_reason = get_anomaly_reason_from_component_type(component_type)
if mapped_reason and mapped_reason not in anomaly_reason:
anomaly_reason.append(mapped_reason)
# If no components found, still add "none"
if not anomaly_reason:
anomaly_reason.append("none")
else:
anomaly_reason.append("none")
# if in a logical group, add the logical group message
if isUnderLogicalGroup is True:
status_message.append(LogicalGroupMsg)
else:
if isFuture is True:
status_message.append(isFutureMsg)
anomaly_reason.append("future_over_tolerance")
# logical group
if isUnderLogicalGroup is True:
status_message.append(LogicalGroupMsg)
anomaly_reason.append("in_logical_group")
# Deduplicate anomaly_reason before setting it in status_message_json
# This prevents duplicates (e.g., delay_threshold_breached added multiple times for multiple metrics)
if isinstance(anomaly_reason, list):
if len(anomaly_reason) > 1 and "none" in anomaly_reason:
anomaly_reason.remove("none")
# deduplicate anomaly_reason to avoid duplicates
anomaly_reason = list(set(anomaly_reason))
# form status_message_json
status_message_json["status_message"] = status_message
status_message_json["anomaly_reason"] = anomaly_reason
# Add score information to status_message_json for UI display (sorted alphabetically)
# Use total_score if calculated (hybrid scoring), otherwise use base score
if total_score is not None:
status_message_json["score"] = float(total_score)
# Update record score to reflect the calculated total_score for UI consistency
record["score"] = float(total_score)
# Add score definition for drilldown modal
if score_definition:
status_message_json["score_definition"] = score_definition
record["score_definition"] = json.dumps(score_definition) if isinstance(score_definition, dict) else score_definition
elif score is not None:
status_message_json["score"] = float(score)
if score_outliers is not None:
status_message_json["score_outliers"] = float(score_outliers)
if total_score is not None:
status_message_json["total_score"] = float(total_score)
# get disruption_duration
if not disruption_queue_record:
record["disruption_min_time_sec"] = 0
else:
logger.debug(
f'disruption_queue_record="{disruption_queue_record}", getting disruption_duration'
)
disruption_object_state = disruption_queue_record.get("object_state", "green")
try:
disruption_min_time_sec = int(
disruption_queue_record.get("disruption_min_time_sec", 0)
)
except:
disruption_min_time_sec = 0
# add to the record
record["disruption_min_time_sec"] = disruption_min_time_sec
try:
disruption_start_epoch = float(
disruption_queue_record.get("disruption_start_epoch", 0)
)
except:
disruption_start_epoch = 0
# Case 1: Entity is no longer in alert state (not red)
if object_state != "red":
# Only update if we were previously tracking a disruption
if disruption_object_state == "red":
disruption_queue_record["object_state"] = object_state
disruption_queue_record["disruption_start_epoch"] = 0
disruption_queue_record["mtime"] = time.time()
try:
disruption_queue_update(
disruption_queue_collection, disruption_queue_record
)
except Exception as e:
logger.error(f"error updating disruption_queue_record: {e}")
return object_state, status_message, status_message_json, anomaly_reason
# Case 2: Entity is in alert state (red)
if object_state == "red":
current_time = time.time()
# If this is a new disruption, start tracking it
if disruption_object_state != "red":
disruption_queue_record["object_state"] = "red"
disruption_queue_record["disruption_start_epoch"] = current_time
disruption_queue_record["mtime"] = current_time
try:
disruption_queue_update(
disruption_queue_collection, disruption_queue_record
)
except Exception as e:
logger.error(f"error updating disruption_queue_record: {e}")
# For new disruptions, if min time is set, show as blue with message
if disruption_min_time_sec > 0:
object_state = "blue"
status_message.append(
f"Minimal disruption time is configured for this entity, the current disruption duration is 0 which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
)
status_message_json["status_message"] = status_message
return object_state, status_message, status_message_json, anomaly_reason
# If we're already tracking a disruption, check duration
if disruption_min_time_sec > 0:
try:
disruption_duration = current_time - disruption_start_epoch
except Exception as e:
logger.error(f"error calculating disruption_duration: {e}")
disruption_duration = 0
# If duration hasn't breached threshold, show as blue with message
if disruption_duration < disruption_min_time_sec:
object_state = "blue"
status_message.append(
f"Minimal disruption time is configured for this entity, the current disruption duration is {convert_seconds_to_duration(disruption_duration)} which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
)
status_message_json["status_message"] = status_message
# anomaly_reason sanitify check, if the list has more than 1 item, and contains "none", remove it
# Also ensure status_message_json is updated with deduplicated list (safety check)
if isinstance(anomaly_reason, list):
if len(anomaly_reason) > 1 and "none" in anomaly_reason:
anomaly_reason.remove("none")
# deduplicate anomaly_reason to avoid duplicates (e.g., delay_threshold_breached added multiple times for multiple metrics)
anomaly_reason = list(set(anomaly_reason))
# Update status_message_json to ensure it has the deduplicated list
status_message_json["anomaly_reason"] = anomaly_reason
# return
logging.debug(
f'set_mhm_status, object="{record.get("object")}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
)
return object_state, status_message, status_message_json, anomaly_reason
def set_flx_status(
logger,
splunkd_uri,
session_key,
tenant_id,
record,
isOutlier,
isUnderMonitoring,
isUnderMonitoringMsg,
object_logical_group_dict,
threshold_alert,
threshold_messages,
disruption_queue_collection,
disruption_queue_record,
source_handler=None,
monitoring_anomaly_reason=None,
score=None,
score_outliers=None,
threshold_scores=None,
vtenant_account=None,
):
"""
Create a function called set_flx_status:
- arguments: record, isOutlier, isFuture, isUnderMonitoring, isUnderMonitoringMsg, isUnderLogicalGroup, LogicalGroupStateInAlert, isUnderLatencyAlert, isUnderLatencyMessage, isUnderDelayAlert, isUnderDelayMessage
- returns:
object_state (string): blue, orange, green, red
anomaly_reason (list): list of short code reasons why the object is in anomaly
status_message (list): list of long description reasons why the object is in anomaly
- behaviour:
object_state:
green if:
isOutlier is 1
isFuture is False
isUnderMonitoring is True
if isUnderLogicalGroup is True, then LogicalGroupStateInAlert must be False
isUnderLatencyAlert is False
isUnderDelayAlert is False
blue if:
Any of the condition above is not met, but isUnderLogicalGroup is True and LogicalGroupStateInAlert is True
orange if:
All green conditions are met except for isFuture which would be True
red if:
Any of the green conditions are not met, and blue conditions and orange conditions are not met
anomaly_reason:
if object_state is green, anomnaly_reason is None
Otherwise, anomaly_reason is a list containing the reasons why the object is in anomaly
"""
# init status_message and anomaly_reason
status_message = []
anomaly_reason = []
# upstream anomaly_reason
upstream_anomaly_reason = record.get("anomaly_reason", [])
if isinstance(upstream_anomaly_reason, str):
upstream_anomaly_reason = [upstream_anomaly_reason]
# init status_message_json
status_message_json = {}
# status and status_description are used to compose the anomaly_reason
status = record.get("status", "unknown")
try:
status = int(status)
except Exception as e:
pass
status_description = record.get("status_description", "unknown")
# Capture original upstream status from the search before any modifications
# In real-time processing, check metrics.status first (original search status)
# If not available, fall back to the current status field
original_upstream_status = None
metrics = record.get("metrics", {})
if isinstance(metrics, str):
try:
metrics = json.loads(metrics)
except Exception:
metrics = {}
if isinstance(metrics, dict) and "status" in metrics:
try:
original_upstream_status = int(metrics["status"])
except Exception:
pass
# If metrics.status not available, use the current status field as fallback
if original_upstream_status is None:
original_upstream_status = status
# for flx, object_state can be defined upstream based on the status
object_state = "unknown"
if status == 1:
object_state = "green"
elif status == 2:
object_state = "red"
elif status == 3:
object_state = "orange"
else:
pass
# for flx, attempt to retrieve extra_attributes, if present attempt to load as an object
extra_attributes = record.get("extra_attributes", {})
if isinstance(extra_attributes, str):
if len(extra_attributes) > 0:
try:
extra_attributes = json.loads(extra_attributes)
except Exception as e:
logger.error(
f"Error in processing extra_attributes: {extra_attributes}. Error: {str(e)}"
)
else:
extra_attributes = {}
# if source_handler is not trackmedecisionmaker, consider the upstream status as the source of truth
if source_handler == "trackmedecisionmaker":
if status != 1 and not (
len(upstream_anomaly_reason) == 1
and upstream_anomaly_reason[0] == "inactive"
):
if "status_not_met" not in upstream_anomaly_reason:
upstream_anomaly_reason.append("status_not_met")
logging.debug(
f'source_handler="{source_handler}", entering set_flx_status, object="{record.get("object")}", object_state="{object_state}", status="{status}", upstream_anomaly_reason="{upstream_anomaly_reason}"'
)
#
# Threshold alert management
#
# if threshold_alert is True, then object_state is red
record["threshold_alert"] = threshold_alert
record["threshold_messages"] = threshold_messages
if threshold_alert == 1:
object_state = "red"
status = 2
anomaly_reason.append("threshold_alert")
for threshold_message in threshold_messages:
status_message.append(threshold_message)
# in record, update status_description and status_description_short with a CSV string of the threshold_messages
record["status_description"] = ",".join(threshold_messages)
record["status_description_short"] = ",".join(threshold_messages)
else:
# remove threshold_alert from upstream_anomaly_reason, if present
if "threshold_alert" in upstream_anomaly_reason:
upstream_anomaly_reason.remove("threshold_alert")
# if the unique anomaly reason was threshold_alert, then object_state is green
# BUT only if original_upstream_status is 1 (good status)
# If original_upstream_status != 1 (status_not_met), we should keep the red/orange state
if len(upstream_anomaly_reason) == 0 and original_upstream_status == 1:
object_state = "green"
status = 1
#
# Logical group management
#
(
isUnderLogicalGroup,
LogicalGroupStateInAlert,
LogicalGroupMsg,
) = get_and_manage_logical_group_status(
splunkd_uri,
session_key,
tenant_id,
record.get("object"),
object_state,
record.get("object_group_key"),
object_logical_group_dict,
)
# log debug
logger.debug(
f'function get_and_manage_logical_group_status: object="{record.get("object")}", object_state="{object_state}", object_group_key="{record.get("object_group_key")}", isUnderLogicalGroup="{isUnderLogicalGroup}", LogicalGroupStateInAlert="{LogicalGroupStateInAlert}", LogicalGroupMsg="{LogicalGroupMsg}"'
)
# get status_description_short and ensures it always has a value
status_description_short = record.get("status_description_short", None)
if not status_description_short:
record["status_description_short"] = status_description
status_description_short = status_description
# Verify isOutlier
# Only set red if isOutlier == 1 AND score_outliers > 0 (or score_outliers is None for legacy)
# If score_outliers <= 0, outliers are suppressed (false positive) and should not cause red state
if isOutlier == 1:
if score_outliers is not None:
if score_outliers > 0:
# Outliers present with positive score
if score_outliers >= 100:
object_state = "red"
status = 2
else:
# score_outliers > 0 and < 100, set to orange
object_state = "orange"
status = 3
# If score_outliers <= 0, don't set state to red/orange (outliers suppressed)
else:
# Legacy behavior: if score_outliers is not provided, use isOutlier
object_state = "red"
status = 2
else:
pass
# if object_state is red but isUnderMonitoring is False, then object_state is orange
if object_state == "red":
if isUnderMonitoring is False:
object_state = "orange"
status = 3
#
# Hybrid scoring: Apply score-based logic
# Outliers are handled separately via score_outliers in get_outliers_status
#
total_score = None
score_definition = {}
if score is not None:
# Calculate total score with static increments for anomalies
base_score = float(score) if score is not None else 0.0
total_score = base_score
# Build score definition to track where the score comes from
# Convert base_score to integer if it's a whole number, otherwise keep as float
if base_score == int(base_score):
score_definition["base_score"] = int(base_score)
else:
score_definition["base_score"] = base_score
score_definition["components"] = []
# Add static increments for each anomaly type
if threshold_alert == 1:
# Use threshold scores if provided, otherwise default to 100
if threshold_scores and len(threshold_scores) > 0:
# Sum all threshold scores (multiple thresholds can be breached)
increment = sum(threshold_scores)
else:
# Default to 100 for backward compatibility
increment = 100
total_score += increment
score_definition["components"].append({
"type": "threshold_breach",
"score": increment,
"description": "Threshold alert breached"
})
# Add outlier score if present
if score_outliers is not None and score_outliers > 0:
score_definition["score_outliers"] = float(score_outliers)
# Add score sources if available
score_source = record.get("score_source", [])
if score_source:
score_definition["score_source"] = score_source if isinstance(score_source, list) else [score_source]
# Convert total_score to integer if it's a whole number, otherwise keep as float
if total_score is not None:
if total_score == int(total_score):
score_definition["total_score"] = int(total_score)
else:
score_definition["total_score"] = total_score
else:
score_definition["total_score"] = total_score
# Apply score-based logic:
# - If total_score >= 100: entity should be red (if not already red due to other reasons, keep current state)
# - If total_score > 0 and < 100: entity should be orange (even if currently green)
# - If total_score == 0: keep current state
if total_score >= 100:
# If score >= 100, ensure entity is red (unless it's blue due to logical group)
if object_state not in ["red", "blue"]:
object_state = "red"
status = 2
logging.debug(
f'set_flx_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", setting state to red (score >= 100)'
)
else:
logging.debug(
f'set_flx_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", keeping {object_state} state (score >= 100)'
)
elif total_score > 0 and total_score < 100:
# If score > 0 and < 100, entity should be orange (even if currently green)
if object_state == "green":
object_state = "orange"
status = 3
# Add status message about score
status_message.append(
f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
f"This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
)
logging.debug(
f'set_flx_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", setting green to orange (0 < score < 100)'
)
elif object_state == "red":
# Downgrade red to orange if score < 100
# Only apply score-based downgrade if the red state is NOT due to outliers
# (outliers with score_outliers >= 100 should still be red)
if isOutlier != 1:
object_state = "orange"
status = 3
logging.debug(
f'set_flx_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", downgrading red to orange (non-outlier anomalies only)'
)
else:
# If outlier is present but score_outliers < 100, it was already set to isOutlier=2
# in get_outliers_status, so we can still apply score-based logic
if score_outliers is not None and score_outliers < 100:
object_state = "orange"
status = 3
logging.debug(
f'set_flx_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", score_outliers="{score_outliers}", '
f'downgrading red to orange (outlier score too low)'
)
else:
logging.debug(
f'set_flx_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", keeping red state (outlier score >= 100)'
)
else:
logging.debug(
f'set_flx_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", keeping {object_state} state (0 < score < 100)'
)
else:
# total_score == 0 or total_score <= 0
# Check if score is 0 due to false_positive (global false positive, not just outliers)
score_source = record.get("score_source", [])
score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
has_false_positive = "false_positive" in score_source_list
if has_false_positive:
# Score is 0 due to false_positive, set to green (anomaly_reason will remain visible for audit)
object_state = "green"
status = 1
logging.debug(
f'set_flx_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", score_source="{score_source}", '
f'setting state to green (false positive set, score cancelled)'
)
elif score_outliers is not None and score_outliers <= 0 and threshold_alert != 1 and original_upstream_status == 1:
# Outliers are suppressed (false positive), and no other issues, set to green
# BUT only if original_upstream_status is 1 (good status)
# If original_upstream_status != 1 (status_not_met), we should keep the red/orange state
object_state = "green"
status = 1
logging.debug(
f'set_flx_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", score_outliers="{score_outliers}", '
f'original_upstream_status="{original_upstream_status}", '
f'setting state to green (outliers suppressed, no other issues, upstream status is good)'
)
else:
# Keep current state if there are other issues or legacy behavior
# Also keep current state if original_upstream_status != 1 (status_not_met)
logging.debug(
f'set_flx_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", score_outliers="{score_outliers}", '
f'threshold_alert="{threshold_alert}", original_upstream_status="{original_upstream_status}", '
f'keeping current state (score == 0, may have upstream status issues)'
)
# Safeguard: If original_upstream_status == 2 (status_not_met), ensure state is not green
# (except for false_positive which is an explicit override)
if original_upstream_status == 2 and object_state == "green":
score_source = record.get("score_source", [])
score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
has_false_positive = "false_positive" in score_source_list
if not has_false_positive:
# Restore red state if upstream status indicates status_not_met
object_state = "red"
status = 2
logging.debug(
f'set_flx_status, safeguard: object="{record.get("object")}", '
f'original_upstream_status="{original_upstream_status}", '
f'correcting green state back to red (status_not_met detected)'
)
# define anomaly_reason
if object_state == "green":
status_message_str = f"The entity status is complying with monitoring rules (status: {status}, status_description: {status_description})"
status_message.append(status_message_str)
# Check if false positive is set - if so, preserve anomaly reasons from score_definition
score_source = record.get("score_source", [])
score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
has_false_positive = "false_positive" in score_source_list
if has_false_positive and score_definition and "components" in score_definition:
# Extract anomaly reasons from score_definition components
for component in score_definition.get("components", []):
component_type = component.get("type")
if component_type:
mapped_reason = get_anomaly_reason_from_component_type(component_type)
if mapped_reason and mapped_reason not in anomaly_reason:
anomaly_reason.append(mapped_reason)
# If no components found, still add "none"
if not anomaly_reason:
anomaly_reason.append("none")
else:
anomaly_reason.append("none")
# if in a logical group, add the logical group message
if isUnderLogicalGroup is True:
status_message.append(LogicalGroupMsg)
#
# Inactive entities management
#
# get max_sec_inactive
max_sec_inactive = record.get("max_sec_inactive", 0)
try:
max_sec_inactive = int(max_sec_inactive)
except Exception as e:
max_sec_inactive = 0
# get the age in seconds since the latest execution
sec_since_last_execution = round(
time.time() - float(record.get("tracker_runtime")), 0
)
duration_since_last_execution = convert_seconds_to_duration(
sec_since_last_execution
)
# Check and act
if float(sec_since_last_execution) > max_sec_inactive and max_sec_inactive > 0:
status_message_str = f"This entity has been inactive for more than {duration_since_last_execution} (D+HH:MM:SS) and was not actively managed by any tracker, its status was updated automatically by the inactive entities tracker"
status_message = [status_message_str]
status_description_short = "entity is red due to inactivity"
status_description = f"The entity status is red due to inactivity, it was not actively managed by any tracker for more than {duration_since_last_execution} (D+HH:MM:SS)"
anomaly_reason = ["inactive"]
object_state = "red"
status = 2
# in this case, we need to update the status_description and status_description_short
record["status_description"] = status_description
record["status_description_short"] = status_description_short
record["object_state"] = object_state
# Add score increment for inactive if scoring is enabled (using VT-specific impact score)
if score is not None and total_score is not None:
increment = get_impact_score(vtenant_account, "impact_score_flx_inactive", 100)
total_score += increment
score_definition["components"].append({
"type": "inactive",
"score": increment,
"description": "Entity inactive"
})
# Convert total_score to integer if it's a whole number, otherwise keep as float
if total_score is not None:
if total_score == int(total_score):
score_definition["total_score"] = int(total_score)
else:
score_definition["total_score"] = total_score
else:
score_definition["total_score"] = total_score
#
# end of inactive entities management
#
#
# Red status due to upstream Flex logic / Orange state
#
# Only add status_not_met if the original upstream status from the search indicates a problem
# (status != 1), not when we're orange/red purely due to score-based logic or outliers
# status_not_met should only be driven by the hybrid tracker search itself
# Don't add status_not_met if:
# 1. Entity is orange/red ONLY due to outliers (regardless of score)
# 2. Outliers score >= 100 (entity already red due to outliers)
# 3. Entity is orange/red ONLY due to threshold breaches with score < 100
# Check if entity is orange/red ONLY due to outliers
has_outliers_only = (
isOutlier == 1
and score_outliers is not None
and not threshold_alert
)
# Check if outliers score is >= 100 (entity already red due to outliers)
outliers_score_high = (
isOutlier == 1
and score_outliers is not None
and score_outliers >= 100
)
# Check if entity is orange/red ONLY due to threshold breaches with score < 100
# Calculate threshold score to check if it's < 100
threshold_score_sum = 0
if threshold_alert == 1 and threshold_scores and len(threshold_scores) > 0:
threshold_score_sum = sum(threshold_scores)
elif threshold_alert == 1:
threshold_score_sum = 100 # Default score if threshold_scores not provided
has_threshold_only_low_score = (
threshold_alert == 1
and isOutlier != 1
and threshold_score_sum < 100
)
# Only add status_not_met if:
# - Original upstream status was bad (status != 1) AND
# - Entity is in non-green state AND
# - Entity is NOT orange/red ONLY due to outliers (any score) AND
# - Outliers score is NOT >= 100 (don't add if outliers already made it red) AND
# - Entity is NOT orange/red ONLY due to threshold breaches with score < 100
if ((object_state == "red" and not threshold_alert) or object_state == "orange") and original_upstream_status != 1 and not has_outliers_only and not outliers_score_high and not has_threshold_only_low_score:
status_message_str = f"The entity status is not complying with monitoring rules (status: {status}, status_description: {status_description})"
status_message.append(status_message_str)
anomaly_reason.append("status_not_met")
# Add score increment for status_not_met if scoring is enabled (using VT-specific impact score)
if score is not None and total_score is not None:
increment = get_impact_score(vtenant_account, "impact_score_flx_status_not_met", 100)
total_score += increment
score_definition["components"].append({
"type": "status_not_met",
"score": increment,
"description": "Status not met"
})
# Convert total_score to integer if it's a whole number, otherwise keep as float
if total_score is not None:
if total_score == int(total_score):
score_definition["total_score"] = int(total_score)
else:
score_definition["total_score"] = total_score
else:
score_definition["total_score"] = total_score
# Re-check score-based logic after adding status_not_met score
# If total_score >= 100, ensure entity is red (unless it's blue due to logical group)
if total_score >= 100:
if object_state not in ["red", "blue"]:
object_state = "red"
status = 2
logging.debug(
f'set_flx_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", setting state to red after status_not_met (score >= 100)'
)
# Other statements
# Check for outliers: report isOutlier in status_message for both red and orange states
# Add ml_outliers_detection to anomaly_reason for all outlier cases
if isOutlier == 1 or (score_outliers is not None and score_outliers > 0):
# Always add outlier reasons when outliers are present (either traditional or hybrid scoring)
outlier_reasons = record.get("isOutlierReason", [])
if outlier_reasons:
if isinstance(outlier_reasons, list):
# Join the list elements into a single string
outlier_reasons_str = " | ".join(outlier_reasons)
status_message.append(outlier_reasons_str)
else:
# If it's not a list, append it directly
status_message.append(outlier_reasons)
# Add ml_outliers_detection to anomaly_reason for all outlier cases
if "ml_outliers_detection" not in anomaly_reason:
anomaly_reason.append("ml_outliers_detection")
# Add status message for orange state (score_outliers > 0 and < 100)
if score_outliers is not None and score_outliers > 0 and score_outliers < 100:
base_score = float(score) if score is not None else 0.0
status_message.append(
f"Entity has an impact score of {score_outliers:.1f} (base score: {base_score:.1f}), which is above 0 but below 100. "
f"This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
)
if object_state == "red":
# Monitoring time policy, add the message first then the anomaly reason
if isUnderMonitoring is False:
status_message.append(isUnderMonitoringMsg)
# Use new monitoring anomaly reason if provided
if monitoring_anomaly_reason:
anomaly_reason.append(monitoring_anomaly_reason)
else:
anomaly_reason.append("out_of_monitoring_times")
# Note: out_of_monitoring_times is not scored as an anomaly - it's a protective mechanism
# that prevents entities from turning red when outside their monitoring window
# logical group
if isUnderLogicalGroup is True:
status_message.append(LogicalGroupMsg)
anomaly_reason.append("in_logical_group")
# Note: in_logical_group is not scored as an anomaly - it's a protective mechanism
# that prevents entities from turning red when the logical group is compliant
#
# Logical group management (object_state is red but in a logical group which is not in alert)
#
# if object_state is red but isUnderLogicalGroup is True and LogicalGroupStateInAlert is False, then object_state is blue
if object_state == "red" and isUnderLogicalGroup is True:
if LogicalGroupStateInAlert is False:
object_state = "blue"
status = 3
#
# Out of monitoring days and hours management
#
# if object_state is red but isUnderMonitoring is False, then object_state is orange
# However, if total_score >= 100, keep red state (score-based logic takes precedence)
if object_state == "red":
if isUnderMonitoring is False:
# Don't downgrade to orange if score >= 100 (score-based logic takes precedence)
if total_score is None or total_score < 100:
object_state = "orange"
status = 3
# update status, object_state, anomaly_reason and metrics
record["status"] = status
record["object_state"] = object_state
record["anomaly_reason"] = anomaly_reason
# ensure status metric in metrics is updated
try:
metrics_record = record.get("metrics", {})
if isinstance(metrics_record, str):
metrics_record = json.loads(metrics_record)
metrics_record["status"] = status
record["metrics"] = json.dumps(metrics_record)
except Exception as e:
pass
status_message_json["status_message"] = status_message
status_message_json["anomaly_reason"] = anomaly_reason
if extra_attributes:
status_message_json["extra_attributes"] = extra_attributes
# Add score information to status_message_json for UI display (sorted alphabetically)
# Use total_score if calculated (hybrid scoring), otherwise use base score
if total_score is not None:
status_message_json["score"] = float(total_score)
# Update record score to reflect the calculated total_score for UI consistency
record["score"] = float(total_score)
# Add score definition for drilldown modal
if score_definition:
status_message_json["score_definition"] = score_definition
record["score_definition"] = json.dumps(score_definition) if isinstance(score_definition, dict) else score_definition
elif score is not None:
status_message_json["score"] = float(score)
if score_outliers is not None:
status_message_json["score_outliers"] = float(score_outliers)
if total_score is not None:
status_message_json["total_score"] = float(total_score)
# get disruption_duration
if not disruption_queue_record:
record["disruption_min_time_sec"] = 0
else:
logger.debug(
f'disruption_queue_record="{disruption_queue_record}", getting disruption_duration'
)
disruption_object_state = disruption_queue_record.get("object_state", "green")
try:
disruption_min_time_sec = int(
disruption_queue_record.get("disruption_min_time_sec", 0)
)
except:
disruption_min_time_sec = 0
# add to the record
record["disruption_min_time_sec"] = disruption_min_time_sec
try:
disruption_start_epoch = float(
disruption_queue_record.get("disruption_start_epoch", 0)
)
except:
disruption_start_epoch = 0
# Case 1: Entity is no longer in alert state (not red)
if object_state != "red":
# Only update if we were previously tracking a disruption
if disruption_object_state == "red":
disruption_queue_record["object_state"] = object_state
disruption_queue_record["disruption_start_epoch"] = 0
disruption_queue_record["mtime"] = time.time()
try:
disruption_queue_update(
disruption_queue_collection, disruption_queue_record
)
except Exception as e:
logger.error(f"error updating disruption_queue_record: {e}")
return object_state, status_message, status_message_json, anomaly_reason
# Case 2: Entity is in alert state (red)
if object_state == "red":
current_time = time.time()
# If this is a new disruption, start tracking it
if disruption_object_state != "red":
disruption_queue_record["object_state"] = "red"
disruption_queue_record["disruption_start_epoch"] = current_time
disruption_queue_record["mtime"] = current_time
try:
disruption_queue_update(
disruption_queue_collection, disruption_queue_record
)
except Exception as e:
logger.error(f"error updating disruption_queue_record: {e}")
# For new disruptions, if min time is set, show as blue with message
if disruption_min_time_sec > 0:
object_state = "blue"
status_message.append(
f"Minimal disruption time is configured for this entity, the current disruption duration is 0 which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
)
status_message_json["status_message"] = status_message
return object_state, status_message, status_message_json, anomaly_reason
# If we're already tracking a disruption, check duration
if disruption_min_time_sec > 0:
try:
disruption_duration = current_time - disruption_start_epoch
except Exception as e:
logger.error(f"error calculating disruption_duration: {e}")
disruption_duration = 0
# If duration hasn't breached threshold, show as blue with message
if disruption_duration < disruption_min_time_sec:
object_state = "blue"
status_message.append(
f"Minimal disruption time is configured for this entity, the current disruption duration is {convert_seconds_to_duration(disruption_duration)} which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
)
status_message_json["status_message"] = status_message
# anomaly_reason sanitify check, if the list has more than 1 item, and contains "none", remove it
if isinstance(anomaly_reason, list):
if len(anomaly_reason) > 1 and "none" in anomaly_reason:
anomaly_reason.remove("none")
# return
logging.debug(
f'set_flx_status, object="{record.get("object")}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
)
return object_state, status_message, status_message_json, anomaly_reason
def set_fqm_status(
logger,
splunkd_uri,
session_key,
tenant_id,
record,
isOutlier,
isUnderMonitoring,
isUnderMonitoringMsg,
object_logical_group_dict,
threshold_alert,
threshold_messages,
disruption_queue_collection,
disruption_queue_record,
source_handler=None,
monitoring_anomaly_reason=None,
score=None,
score_outliers=None,
threshold_scores=None,
vtenant_account=None,
):
"""
Create a function called set_fqm_status:
- arguments: record, isOutlier, isFuture, isUnderMonitoring, isUnderMonitoringMsg, isUnderLogicalGroup, LogicalGroupStateInAlert, isUnderLatencyAlert, isUnderLatencyMessage, isUnderDelayAlert, isUnderDelayMessage
- returns:
object_state (string): blue, orange, green, red
anomaly_reason (list): list of short code reasons why the object is in anomaly
status_message (list): list of long description reasons why the object is in anomaly
- behaviour:
object_state:
green if:
isOutlier is 1
isFuture is False
isUnderMonitoring is True
if isUnderLogicalGroup is True, then LogicalGroupStateInAlert must be False
isUnderLatencyAlert is False
isUnderDelayAlert is False
blue if:
Any of the condition above is not met, but isUnderLogicalGroup is True and LogicalGroupStateInAlert is True
orange if:
All green conditions are met except for isFuture which would be True
red if:
Any of the green conditions are not met, and blue conditions and orange conditions are not met
anomaly_reason:
if object_state is green, anomnaly_reason is None
Otherwise, anomaly_reason is a list containing the reasons why the object is in anomaly
"""
# init status_message and anomaly_reason
status_message = []
anomaly_reason = []
# get percent_success
percent_success = record.get("percent_success", None)
if percent_success is not None:
percent_success = float(percent_success)
if percent_success == int(percent_success):
percent_success = int(percent_success)
else:
percent_success = 0
# get percent_coverage
percent_coverage = record.get("percent_coverage", None)
if percent_coverage is not None:
percent_coverage = float(percent_coverage)
if percent_coverage == int(percent_coverage):
percent_coverage = int(percent_coverage)
else:
percent_coverage = 0
# get ields_quality_summary JSON, and load as an object
fields_quality_summary = record.get("fields_quality_summary", {})
if isinstance(fields_quality_summary, str):
try:
fields_quality_summary = json.loads(fields_quality_summary)
except Exception as e:
fields_quality_summary = {}
else:
fields_quality_summary = {}
# get total_fields_passed and total_fields_failed (for the global entity)
if fields_quality_summary:
total_fields_passed = fields_quality_summary.get("total_fields_passed", 0)
if isinstance(total_fields_passed, str):
try:
total_fields_passed = int(total_fields_passed)
except Exception as e:
total_fields_passed = 0
total_fields_failed = fields_quality_summary.get("total_fields_failed", 0)
if isinstance(total_fields_failed, str):
try:
total_fields_failed = int(total_fields_failed)
except Exception as e:
total_fields_failed = 0
# set fqm_type (if @global in object, then fqm_type is global, otherwise it is field)
fqm_type = "field"
if "@global" in record.get("object", ""):
fqm_type = "global"
# set object_description
object_description = {}
# 1 - try to load the content of fields_quality_summary (JSON as string)
# 2 - iterate over the JSON and look for fields metadata.*
# 3 - add them to the record as metadata_<fieldname> (instead of metadata.<fieldname>)
if "fields_quality_summary" in record:
try:
fields_quality_summary = json.loads(record["fields_quality_summary"])
for field in fields_quality_summary:
if field.startswith("metadata."):
newfield_name = field.replace("metadata.", "metadata_")
object_description[f"{newfield_name}"] = fields_quality_summary[field]
except:
pass
# add field
object_description["field"] = record.get('fieldname')
object_description = json.dumps(object_description, indent=2)
record["object_description"] = object_description
# init status_message_json
status_message_json = {}
# init status, status_description, status_description_short, object_state
status = 1
if fqm_type == "field":
status_description = f"The field {record.get('fieldname')} is complying with monitoring rules, % success: {percent_success}, % coverage: {percent_coverage}"
status_description_short = f"% success: {percent_success}, % coverage: {percent_coverage}"
elif fqm_type == "global":
status_description = f"The global entity is complying with monitoring rules, % success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
status_description_short = f"% success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
object_state = "green"
# mandatorily update the record
record["status"] = status
record["status_description"] = status_description
record["status_description_short"] = status_description_short
record["object_state"] = object_state
#
# Threshold alert management
#
# In fqm. the threshold is mandatory and the root logic of the detection
record["threshold_alert"] = threshold_alert
record["threshold_messages"] = threshold_messages
if threshold_alert == 1:
object_state = "red"
status = 2
anomaly_reason.append("threshold_alert")
for threshold_message in threshold_messages:
status_message.append(threshold_message)
# Update status_description for alert state
if fqm_type == "field":
status_description = f"The field {record.get('fieldname')} is not complying with monitoring rules, % success: {percent_success}, % coverage: {percent_coverage}"
status_description_short = f"% success: {percent_success}, % coverage: {percent_coverage}"
# include additional messages in status_message depending on the description field in fields_quality_summary
if fields_quality_summary:
quality_results_description = fields_quality_summary.get("quality_results_description", [])
for description_item in quality_results_description:
if description_item.startswith("category: Field does not exist"):
status_message.append("The field has failed to pass quality verifications (is missing), review the results from the entity field view to troubleshoot these issues")
elif description_item.startswith("category: Field exists but contains 'unknown'"):
status_message.append("The field has failed to pass quality verifications (contains unknown values), review the results from the entity field view to troubleshoot these issues")
elif description_item.startswith("category: Field is empty"):
status_message.append("The field has failed to pass quality verifications (is empty), review the results from the entity field view to troubleshoot these issues")
elif description_item.startswith("category: Field is 'unknown'"):
status_message.append("The field has failed to pass quality verifications (is unknown), review the results from the entity field view to troubleshoot these issues")
elif description_item.startswith("category: Field exists but value does not match the required pattern"):
status_message.append("The field has failed to pass the regex pattern validation, review the results from the Search not matching regex from the entity field view to extract the list of values that do not match the required pattern")
elif description_item.startswith("category: Field exists but one or more values in the list do not match the required pattern"):
status_message.append("The field has failed to pass the regex pattern validation (list values), review the results from the Search not matching regex from the entity field view to extract the list of values that do not match the required pattern")
elif description_item.startswith("category: Field does not exist but is allowed to be missing"):
# Skip this category as it's a success case
continue
elif description_item.startswith("category: Field is empty but is allowed to be empty"):
# Skip this category as it's a success case
continue
elif description_item.startswith("category: Field exists and is valid"):
# Skip this category as it's a success case
continue
elif fqm_type == "global":
status_description = f"The global entity is not complying with monitoring rules, % success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
status_description_short = f"% success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
# include an additional message in status_message if total_fields_failed is greater than 0, including the number of fields that failed
if total_fields_failed > 0:
if total_fields_failed == 1:
status_message.append(f"The global entity has {total_fields_failed} field that failed to pass quality verifications (failed field: {fields_quality_summary.get('failed_fields', [])}), review the results from the entity field view to troubleshoot these issues")
else:
status_message.append(f"The global entity has {total_fields_failed} fields that failed to pass quality verifications (failed fields: {fields_quality_summary.get('failed_fields', [])}), review the results from the entity field view to troubleshoot these issues")
record["status_description"] = status_description
record["status_description_short"] = status_description_short
record["status"] = status
record["object_state"] = object_state
#
# Logical group management
#
(
isUnderLogicalGroup,
LogicalGroupStateInAlert,
LogicalGroupMsg,
) = get_and_manage_logical_group_status(
splunkd_uri,
session_key,
tenant_id,
record.get("object"),
object_state,
record.get("object_group_key"),
object_logical_group_dict,
)
# log debug
logger.debug(
f'function get_and_manage_logical_group_status: object="{record.get("object")}", object_state="{object_state}", object_group_key="{record.get("object_group_key")}", isUnderLogicalGroup="{isUnderLogicalGroup}", LogicalGroupStateInAlert="{LogicalGroupStateInAlert}", LogicalGroupMsg="{LogicalGroupMsg}"'
)
# Verify isOutlier
# Only set red if isOutlier == 1 AND score_outliers > 0 (or score_outliers is None for legacy)
# If score_outliers <= 0, outliers are suppressed (false positive) and should not cause red state
if isOutlier == 1:
if score_outliers is not None:
if score_outliers > 0:
# Outliers present with positive score
if score_outliers >= 100:
object_state = "red"
status = 2
else:
# score_outliers > 0 and < 100, set to orange
object_state = "orange"
status = 3
# If score_outliers <= 0, don't set state to red/orange (outliers suppressed)
else:
# Legacy behavior: if score_outliers is not provided, use isOutlier
object_state = "red"
status = 2
else:
pass
# if object_state is red but isUnderMonitoring is False, then object_state is orange
if object_state == "red":
if isUnderMonitoring is False:
object_state = "orange"
status = 3
# Update status_description for orange state
if fqm_type == "field":
status_description = f"The field {record.get('fieldname')} is not complying with monitoring rules, % success: {percent_success}, % coverage: {percent_coverage}"
status_description_short = f"% success: {percent_success}, % coverage: {percent_coverage}"
elif fqm_type == "global":
status_description = f"The global entity is not complying with monitoring rules, % success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
status_description_short = f"% success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
record["status_description"] = status_description
record["status_description_short"] = status_description_short
record["status"] = status
record["object_state"] = object_state
#
# Hybrid scoring: Apply score-based logic
# Outliers are handled separately via score_outliers in get_outliers_status
#
total_score = None
score_definition = {}
if score is not None:
# Calculate total score with static increments for anomalies
base_score = float(score) if score is not None else 0.0
total_score = base_score
# Build score definition to track where the score comes from
# Convert base_score to integer if it's a whole number, otherwise keep as float
if base_score == int(base_score):
score_definition["base_score"] = int(base_score)
else:
score_definition["base_score"] = base_score
score_definition["components"] = []
# Add static increments for each anomaly type
if threshold_alert == 1:
# Use threshold scores if provided, otherwise default to 100
if threshold_scores and len(threshold_scores) > 0:
# Sum all threshold scores (multiple thresholds can be breached)
increment = sum(threshold_scores)
else:
# Default to 100 for backward compatibility
increment = 100
total_score += increment
score_definition["components"].append({
"type": "threshold_breach",
"score": increment,
"description": "Threshold alert breached"
})
# Add outlier score if present
if score_outliers is not None and score_outliers > 0:
score_definition["score_outliers"] = float(score_outliers)
# Add score sources if available
score_source = record.get("score_source", [])
if score_source:
score_definition["score_source"] = score_source if isinstance(score_source, list) else [score_source]
# Convert total_score to integer if it's a whole number, otherwise keep as float
if total_score is not None:
if total_score == int(total_score):
score_definition["total_score"] = int(total_score)
else:
score_definition["total_score"] = total_score
else:
score_definition["total_score"] = total_score
# Apply score-based logic:
# - If total_score >= 100: entity should be red (if not already red due to other reasons, keep current state)
# - If total_score > 0 and < 100: entity should be orange (even if currently green)
# - If total_score == 0: keep current state
if total_score >= 100:
# If score >= 100, ensure entity is red (unless it's blue due to logical group)
if object_state not in ["red", "blue"]:
object_state = "red"
status = 2
logging.debug(
f'set_fqm_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", setting state to red (score >= 100)'
)
else:
logging.debug(
f'set_fqm_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", keeping {object_state} state (score >= 100)'
)
elif total_score > 0 and total_score < 100:
# If score > 0 and < 100, entity should be orange (even if currently green)
if object_state == "green":
object_state = "orange"
status = 3
# Update status_description for orange state
if fqm_type == "field":
status_description = f"The field {record.get('fieldname')} is not complying with monitoring rules, % success: {percent_success}, % coverage: {percent_coverage}"
status_description_short = f"% success: {percent_success}, % coverage: {percent_coverage}"
elif fqm_type == "global":
status_description = f"The global entity is not complying with monitoring rules, % success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
status_description_short = f"% success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
record["status_description"] = status_description
record["status_description_short"] = status_description_short
record["status"] = status
record["object_state"] = object_state
# Add status message about score
score_msg = f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
# Add outlier context if outliers are present
if score_outliers is not None and score_outliers > 0:
score_msg += f"Outlier anomalies detected with a score of {score_outliers:.1f}. "
score_msg += "This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
status_message.append(score_msg)
logging.debug(
f'set_fqm_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", setting green to orange (0 < score < 100)'
)
elif object_state == "red":
# Downgrade red to orange if score < 100
# Only apply score-based downgrade if the red state is NOT due to outliers
# (outliers with score_outliers >= 100 should still be red)
if isOutlier != 1:
object_state = "orange"
status = 3
# Update status_description for orange state
if fqm_type == "field":
status_description = f"The field {record.get('fieldname')} is not complying with monitoring rules, % success: {percent_success}, % coverage: {percent_coverage}"
status_description_short = f"% success: {percent_success}, % coverage: {percent_coverage}"
elif fqm_type == "global":
status_description = f"The global entity is not complying with monitoring rules, % success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
status_description_short = f"% success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
record["status_description"] = status_description
record["status_description_short"] = status_description_short
record["status"] = status
record["object_state"] = object_state
logging.debug(
f'set_fqm_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", downgrading red to orange (non-outlier anomalies only)'
)
else:
# If outlier is present but score_outliers < 100, it was already set to isOutlier=2
# in get_outliers_status, so we can still apply score-based logic
if score_outliers is not None and score_outliers < 100:
object_state = "orange"
status = 3
# Update status_description for orange state
if fqm_type == "field":
status_description = f"The field {record.get('fieldname')} is not complying with monitoring rules, % success: {percent_success}, % coverage: {percent_coverage}"
status_description_short = f"% success: {percent_success}, % coverage: {percent_coverage}"
elif fqm_type == "global":
status_description = f"The global entity is not complying with monitoring rules, % success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
status_description_short = f"% success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
record["status_description"] = status_description
record["status_description_short"] = status_description_short
record["status"] = status
record["object_state"] = object_state
logging.debug(
f'set_fqm_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", score_outliers="{score_outliers}", '
f'downgrading red to orange (outlier score too low)'
)
else:
logging.debug(
f'set_fqm_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", keeping red state (outlier score >= 100)'
)
else:
logging.debug(
f'set_fqm_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", keeping {object_state} state (0 < score < 100)'
)
else:
# total_score == 0 or total_score <= 0
# Check if score is 0 due to false_positive (global false positive, not just outliers)
score_source = record.get("score_source", [])
score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
has_false_positive = "false_positive" in score_source_list
if has_false_positive:
# Score is 0 due to false_positive, set to green (anomaly_reason will remain visible for audit)
object_state = "green"
status = 1
logging.debug(
f'set_fqm_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", score_source="{score_source}", '
f'setting state to green (false positive set, score cancelled)'
)
elif score_outliers is not None and score_outliers <= 0 and threshold_alert != 1:
# Outliers are suppressed (false positive), and no other issues, set to green
object_state = "green"
status = 1
logging.debug(
f'set_fqm_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", score_outliers="{score_outliers}", '
f'setting state to green (outliers suppressed, no other issues)'
)
else:
# Keep current state if there are other issues or legacy behavior
logging.debug(
f'set_fqm_status, hybrid scoring: object="{record.get("object")}", '
f'total_score="{total_score}", score_outliers="{score_outliers}", '
f'threshold_alert="{threshold_alert}", keeping current state (score == 0)'
)
# define anomaly_reason
if object_state == "green":
if fqm_type == "field":
status_message_str = f"The field {record.get('fieldname')} is complying with monitoring rules, % success: {percent_success}"
elif fqm_type == "global":
status_message_str = f"The global entity is complying with monitoring rules, % success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
status_message.append(status_message_str)
# Check if false positive is set - if so, preserve anomaly reasons from score_definition
score_source = record.get("score_source", [])
score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
has_false_positive = "false_positive" in score_source_list
if has_false_positive and score_definition and "components" in score_definition:
# Extract anomaly reasons from score_definition components
for component in score_definition.get("components", []):
component_type = component.get("type")
if component_type:
mapped_reason = get_anomaly_reason_from_component_type(component_type)
if mapped_reason and mapped_reason not in anomaly_reason:
anomaly_reason.append(mapped_reason)
# If no components found, still add "none"
if not anomaly_reason:
anomaly_reason.append("none")
else:
anomaly_reason.append("none")
# if in a logical group, add the logical group message
if isUnderLogicalGroup is True:
status_message.append(LogicalGroupMsg)
#
# Inactive entities management
#
# get max_sec_inactive
max_sec_inactive = record.get("max_sec_inactive", 0)
try:
max_sec_inactive = int(max_sec_inactive)
except Exception as e:
max_sec_inactive = 0
# get the age in seconds since the latest execution
sec_since_last_execution = round(
time.time() - float(record.get("tracker_runtime")), 0
)
duration_since_last_execution = convert_seconds_to_duration(
sec_since_last_execution
)
# Check and act
if float(sec_since_last_execution) > max_sec_inactive and max_sec_inactive > 0:
status_message_str = f"This entity has been inactive for more than {duration_since_last_execution} (D+HH:MM:SS) and was not actively managed by any tracker, its status was updated automatically by the inactive entities tracker"
status_message = [status_message_str]
status_description_short = "entity is red due to inactivity"
status_description = f"The entity status is red due to inactivity, it was not actively managed by any tracker for more than {duration_since_last_execution} (D+HH:MM:SS)"
anomaly_reason = ["inactive"]
object_state = "red"
status = 2
# in this case, we need to update the status_description and status_description_short
record["status_description"] = status_description
record["status_description_short"] = status_description_short
record["object_state"] = object_state
#
# end of inactive entities management
#
#
# Red status due to upstream logic / Orange state
#
if (object_state == "red" and not threshold_alert) or object_state == "orange":
status_message_str = f"The entity status is not complying with monitoring rules (status: {status}, status_description: {status_description})"
status_message.append(status_message_str)
anomaly_reason.append("status_not_met")
# Update status_description for alert state if not already set
if fqm_type == "field":
status_description = f"The field {record.get('fieldname')} is not complying with monitoring rules, % success: {percent_success}, % coverage: {percent_coverage}"
status_description_short = f"% success: {percent_success}, % coverage: {percent_coverage}"
elif fqm_type == "global":
status_description = f"The global entity is not complying with monitoring rules, % success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
status_description_short = f"% success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
record["status_description"] = status_description
record["status_description_short"] = status_description_short
record["status"] = status
record["object_state"] = object_state
# Add score increment for status_not_met if scoring is enabled (using VT-specific impact score)
if score is not None and total_score is not None:
increment = get_impact_score(vtenant_account, "impact_score_fqm_status_not_met", 100)
total_score += increment
score_definition["components"].append({
"type": "status_not_met",
"score": increment,
"description": "Status not met"
})
# Convert total_score to integer if it's a whole number, otherwise keep as float
if total_score is not None:
if total_score == int(total_score):
score_definition["total_score"] = int(total_score)
else:
score_definition["total_score"] = total_score
else:
score_definition["total_score"] = total_score
# Other statements
if object_state == "red":
# Check for outliers: either isOutlier == 1 (traditional) or score_outliers > 0 (hybrid scoring)
if isOutlier == 1 or (score_outliers is not None and score_outliers > 0):
# Always add outlier reasons when outliers are present (either traditional or hybrid scoring)
outlier_reasons = record.get("isOutlierReason", [])
if outlier_reasons:
if isinstance(outlier_reasons, list):
# Join the list elements into a single string
outlier_reasons_str = " | ".join(outlier_reasons)
status_message.append(outlier_reasons_str)
else:
# If it's not a list, append it directly
status_message.append(outlier_reasons)
# Add ml_outliers_detection to anomaly_reason for all outlier cases
if "ml_outliers_detection" not in anomaly_reason:
anomaly_reason.append("ml_outliers_detection")
# Add status message for orange state (score_outliers > 0 and < 100)
if score_outliers is not None and score_outliers > 0 and score_outliers < 100:
base_score = float(score) if score is not None else 0.0
status_message.append(
f"Entity has an impact score of {score_outliers:.1f} (base score: {base_score:.1f}), which is above 0 but below 100. "
f"This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
)
# Monitoring time policy, add the message first then the anomaly reason
if isUnderMonitoring is False:
status_message.append(isUnderMonitoringMsg)
# Use new monitoring anomaly reason if provided
if monitoring_anomaly_reason:
anomaly_reason.append(monitoring_anomaly_reason)
# logical group
if isUnderLogicalGroup is True:
status_message.append(LogicalGroupMsg)
anomaly_reason.append("in_logical_group")
#
# Logical group management (object_state is red but in a logical group which is not in alert)
#
# if object_state is red but isUnderLogicalGroup is True and LogicalGroupStateInAlert is False, then object_state is blue
if object_state == "red" and isUnderLogicalGroup is True:
if LogicalGroupStateInAlert is False:
object_state = "blue"
status = 3
#
# Out of monitoring days and hours management
#
# if object_state is red but isUnderMonitoring is False, then object_state is orange
if object_state == "red":
if isUnderMonitoring is False:
object_state = "orange"
status = 3
# update status, object_state, anomaly_reason and metrics
record["status"] = status
record["object_state"] = object_state
record["anomaly_reason"] = anomaly_reason
# ensure status metric in metrics is updated
try:
metrics_record = record.get("metrics", {})
if isinstance(metrics_record, str):
metrics_record = json.loads(metrics_record)
metrics_record["status"] = status
record["metrics"] = json.dumps(metrics_record)
except Exception as e:
pass
status_message_json["status_message"] = status_message
status_message_json["anomaly_reason"] = anomaly_reason
# Add score information to status_message_json for UI display (sorted alphabetically)
# Use total_score if calculated (hybrid scoring), otherwise use base score
if total_score is not None:
status_message_json["score"] = float(total_score)
# Update record score to reflect the calculated total_score for UI consistency
record["score"] = float(total_score)
# Add score definition for drilldown modal
if score_definition:
status_message_json["score_definition"] = score_definition
record["score_definition"] = json.dumps(score_definition) if isinstance(score_definition, dict) else score_definition
elif score is not None:
status_message_json["score"] = float(score)
if score_outliers is not None:
status_message_json["score_outliers"] = float(score_outliers)
if total_score is not None:
status_message_json["total_score"] = float(total_score)
# handle fields_quality_summary
try:
fields_quality_summary = record.get("fields_quality_summary", {})
if isinstance(fields_quality_summary, str):
fields_quality_summary = json.loads(fields_quality_summary)
except Exception as e:
fields_quality_summary = {}
pass
if fields_quality_summary:
status_message_json["fields_quality_summary"] = fields_quality_summary
# get disruption_duration
if not disruption_queue_record:
record["disruption_min_time_sec"] = 0
else:
logger.debug(
f'disruption_queue_record="{disruption_queue_record}", getting disruption_duration'
)
disruption_object_state = disruption_queue_record.get("object_state", "green")
try:
disruption_min_time_sec = int(
disruption_queue_record.get("disruption_min_time_sec", 0)
)
except:
disruption_min_time_sec = 0
# add to the record
record["disruption_min_time_sec"] = disruption_min_time_sec
try:
disruption_start_epoch = float(
disruption_queue_record.get("disruption_start_epoch", 0)
)
except:
disruption_start_epoch = 0
# Case 1: Entity is no longer in alert state (not red)
if object_state != "red":
# Only update if we were previously tracking a disruption
if disruption_object_state == "red":
disruption_queue_record["object_state"] = object_state
disruption_queue_record["disruption_start_epoch"] = 0
disruption_queue_record["mtime"] = time.time()
try:
disruption_queue_update(
disruption_queue_collection, disruption_queue_record
)
except Exception as e:
logger.error(f"error updating disruption_queue_record: {e}")
return object_state, status_message, status_message_json, anomaly_reason
# Case 2: Entity is in alert state (red)
if object_state == "red":
current_time = time.time()
# If this is a new disruption, start tracking it
if disruption_object_state != "red":
disruption_queue_record["object_state"] = "red"
disruption_queue_record["disruption_start_epoch"] = current_time
disruption_queue_record["mtime"] = current_time
try:
disruption_queue_update(
disruption_queue_collection, disruption_queue_record
)
except Exception as e:
logger.error(f"error updating disruption_queue_record: {e}")
# For new disruptions, if min time is set, show as blue with message
if disruption_min_time_sec > 0:
object_state = "blue"
status_message.append(
f"Minimal disruption time is configured for this entity, the current disruption duration is 0 which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
)
status_message_json["status_message"] = status_message
return object_state, status_message, status_message_json, anomaly_reason
# If we're already tracking a disruption, check duration
if disruption_min_time_sec > 0:
try:
disruption_duration = current_time - disruption_start_epoch
except Exception as e:
logger.error(f"error calculating disruption_duration: {e}")
disruption_duration = 0
# If duration hasn't breached threshold, show as blue with message
if disruption_duration < disruption_min_time_sec:
object_state = "blue"
status_message.append(
f"Minimal disruption time is configured for this entity, the current disruption duration is {convert_seconds_to_duration(disruption_duration)} which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
)
status_message_json["status_message"] = status_message
# anomaly_reason sanitify check, if the list has more than 1 item, and contains "none", remove it
if isinstance(anomaly_reason, list):
if len(anomaly_reason) > 1 and "none" in anomaly_reason:
anomaly_reason.remove("none")
# return
logging.debug(
f'set_fqm_status, object="{record.get("object")}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
)
return object_state, status_message, status_message_json, anomaly_reason
def set_wlk_status(
    logger,
    splunkd_uri,
    session_key,
    tenant_id,
    record,
    isOutlier,
    isUnderMonitoring,
    isUnderMonitoringMsg,
    disruption_queue_collection,
    disruption_queue_record,
    source_handler=None,
    monitoring_anomaly_reason=None,
    score=None,
    score_outliers=None,
    vtenant_account=None,
):
    """
    Compute the final state, messages and anomaly reasons for a WLK (workload) entity.

    Args:
        logger: logger used for debug/error messages.
        splunkd_uri, session_key, tenant_id: Splunk context (not referenced in this
            function body; presumably kept for signature consistency with the other
            set_*_status functions — TODO confirm).
        record: the entity record (dict); mutated in place (status, object_state
            upstream fields are read, "score", "score_definition" and
            "disruption_min_time_sec" may be written back).
        isOutlier: 1 when ML outliers were detected upstream.
        isUnderMonitoring: False when the entity is outside its monitoring time policy.
        isUnderMonitoringMsg: human readable message used when out of monitoring times.
        disruption_queue_collection: KVstore collection backing the disruption queue.
        disruption_queue_record: current disruption queue record for this entity,
            falsy when the entity is not in the disruption queue.
        source_handler: optional; not referenced in this function body.
        monitoring_anomaly_reason: optional anomaly reason code overriding the
            legacy "out_of_monitoring_times" code.
        score: optional base impact score; hybrid scoring is enabled when not None.
        score_outliers: optional outliers impact score (<=0 means suppressed outliers).
        vtenant_account: tenant account used to resolve per-tenant impact scores
            via get_impact_score.

    Returns:
        tuple: (object_state, status_message, status_message_json, anomaly_reason)
            object_state (str): "green", "blue", "orange" or "red"
            status_message (list): human readable reasons
            status_message_json (dict): JSON-ready summary including score fields
            anomaly_reason (list): short anomaly reason codes
    """
    # init status_message and anomaly_reason
    status_message = []
    anomaly_reason = []
    # init status_message_json
    status_message_json = {}
    # for wlk, first retrieve object_state which is defined upstream
    object_state = record.get("object_state", "green")
    # status and status_description are used to compose the anomaly_reason
    status = record.get("status", "unknown")
    status_description = record.get("status_description", "unknown")
    # Verify isOutlier
    # Only set red if isOutlier == 1 AND score_outliers > 0 (or score_outliers is None for legacy)
    # If score_outliers <= 0, outliers are suppressed (false positive) and should not cause red state
    if isOutlier == 1:
        if score_outliers is not None:
            if score_outliers > 0:
                # Outliers present with positive score
                if score_outliers >= 100:
                    object_state = "red"
                else:
                    # score_outliers > 0 and < 100, set to orange
                    object_state = "orange"
            # If score_outliers <= 0, don't set state to red/orange (outliers suppressed)
        else:
            # Legacy behavior: if score_outliers is not provided, use isOutlier
            object_state = "red"
    else:
        pass
    # for wlk, get various functional fields used for the anomaly_reason and status_message definition
    # skipping KPis: skipped_pct, skipped_pct_last_60m, skipped_pct_last_4h, skipped_pct_last_24h
    # each KPI is defensively coerced to a number, defaulting to 0 on any failure
    try:
        skipped_pct = float(record.get("skipped_pct", 0))
    except Exception as e:
        skipped_pct = 0
    try:
        skipped_pct_last_60m = float(record.get("skipped_pct_last_60m", 0))
    except Exception as e:
        skipped_pct_last_60m = 0
    try:
        skipped_pct_last_4h = float(record.get("skipped_pct_last_4h", 0))
    except Exception as e:
        skipped_pct_last_4h = 0
    try:
        skipped_pct_last_24h = float(record.get("skipped_pct_last_24h", 0))
    except Exception as e:
        skipped_pct_last_24h = 0
    # similarly, load:
    # count_errors, count_errors_last_60m, count_errors_last_4h, count_errors_last_24h
    try:
        count_errors = int(record.get("count_errors", 0))
    except Exception as e:
        count_errors = 0
    try:
        count_errors_last_60m = int(record.get("count_errors_last_60m", 0))
    except Exception as e:
        count_errors_last_60m = 0
    try:
        count_errors_last_4h = int(record.get("count_errors_last_4h", 0))
    except Exception as e:
        count_errors_last_4h = 0
    try:
        count_errors_last_24h = int(record.get("count_errors_last_24h", 0))
    except Exception as e:
        count_errors_last_24h = 0
    # retrieve last_seen (epochtime) and cron_exec_sequence_sec (value in seconds)
    try:
        last_seen = int(record.get("last_seen", 0))
    except Exception as e:
        last_seen = 0
    # get last_seen_datetime
    if last_seen > 0:
        last_seen_datetime = convert_epoch_to_datetime(last_seen)
    else:
        last_seen_datetime = "unknown"
    try:
        cron_exec_sequence_sec = int(record.get("cron_exec_sequence_sec", 0))
    except Exception as e:
        cron_exec_sequence_sec = 0
    # calculate isDelayed (0 or 1)
    # if now()-last_seen)>(cron_exec_sequence_sec+3600, isDelayed is 1
    # the extra 3600 seconds gives a one hour grace period on top of the cron sequence
    now = time.time()
    if (now - last_seen) > (cron_exec_sequence_sec + 3600):
        isDelayed = 1
    else:
        isDelayed = 0
    # calculate the current delay in seconds
    current_delay = now - last_seen
    # get the current delay durection
    current_delay_duration = convert_seconds_to_duration(current_delay)
    # retrieve orphan boolean (0 or 1) and load as an integer, as well as orphan_last_check (human readable date)
    try:
        orphan = int(record.get("orphan", 0))
    except Exception as e:
        orphan = 0
    orphan_last_check = record.get("orphan_last_check", "unknown")
    # if object_state is red but isUnderMonitoring is False, then object_state is orange
    if object_state == "red":
        if isUnderMonitoring is False:
            object_state = "orange"
    #
    # Hybrid scoring: Apply score-based logic
    # Outliers are handled separately via score_outliers in get_outliers_status
    #
    total_score = None
    score_definition = {}
    if score is not None:
        # Calculate total score with static increments for anomalies
        base_score = float(score) if score is not None else 0.0
        total_score = base_score
        # Build score definition to track where the score comes from
        # Convert base_score to integer if it's a whole number, otherwise keep as float
        if base_score == int(base_score):
            score_definition["base_score"] = int(base_score)
        else:
            score_definition["base_score"] = base_score
        score_definition["components"] = []
        # Add outlier score if present
        if score_outliers is not None and score_outliers > 0:
            score_definition["score_outliers"] = float(score_outliers)
        # Add score sources if available
        score_source = record.get("score_source", [])
        if score_source:
            score_definition["score_source"] = score_source if isinstance(score_source, list) else [score_source]
        # Note: Score increments for WLK anomalies are added later when anomalies are detected
        # This ensures scoring happens in sync with anomaly_reason detection
        # Convert total_score to integer if it's a whole number, otherwise keep as float
        if total_score is not None:
            if total_score == int(total_score):
                score_definition["total_score"] = int(total_score)
            else:
                score_definition["total_score"] = total_score
        else:
            score_definition["total_score"] = total_score
        # Apply score-based logic:
        # - If total_score >= 100: entity should be red (if not already red due to other reasons, keep current state)
        # - If total_score > 0 and < 100: entity should be orange (even if currently green)
        # - If total_score == 0: keep current state
        if total_score >= 100:
            # If score >= 100, ensure entity is red
            if object_state not in ["red", "blue"]:
                object_state = "red"
                logging.debug(
                    f'set_wlk_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", setting state to red (score >= 100)'
                )
            else:
                logging.debug(
                    f'set_wlk_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", keeping {object_state} state (score >= 100)'
                )
        elif total_score > 0 and total_score < 100:
            # If score > 0 and < 100, entity should be orange (even if currently green)
            if object_state == "green":
                object_state = "orange"
                # Add status message about score
                score_msg = f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
                # Add outlier context if outliers are present
                if score_outliers is not None and score_outliers > 0:
                    score_msg += f"Outlier anomalies detected with a score of {score_outliers:.1f}. "
                score_msg += "This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
                status_message.append(score_msg)
                logging.debug(
                    f'set_wlk_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", setting green to orange (0 < score < 100)'
                )
            elif object_state == "red":
                # Downgrade red to orange if score < 100
                # Only apply score-based downgrade if the red state is NOT due to outliers
                # (outliers with score_outliers >= 100 should still be red)
                if isOutlier != 1:
                    object_state = "orange"
                    # Add status message about score when downgrading
                    score_msg = f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
                    if score_outliers is not None and score_outliers > 0:
                        score_msg += f"Outlier anomalies detected with a score of {score_outliers:.1f}. "
                    score_msg += "This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
                    status_message.append(score_msg)
                    logging.debug(
                        f'set_wlk_status, hybrid scoring: object="{record.get("object")}", '
                        f'total_score="{total_score}", downgrading red to orange (non-outlier anomalies only)'
                    )
                else:
                    # If outlier is present but score_outliers < 100, it was already set to isOutlier=2
                    # in get_outliers_status, so we can still apply score-based logic
                    if score_outliers is not None and score_outliers < 100:
                        object_state = "orange"
                        # Add status message about score when downgrading due to low outlier score
                        score_msg = f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
                        score_msg += f"Outlier anomalies detected with a score of {score_outliers:.1f}. "
                        score_msg += "This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
                        status_message.append(score_msg)
                        logging.debug(
                            f'set_wlk_status, hybrid scoring: object="{record.get("object")}", '
                            f'total_score="{total_score}", score_outliers="{score_outliers}", '
                            f'downgrading red to orange (outlier score too low)'
                        )
                    else:
                        logging.debug(
                            f'set_wlk_status, hybrid scoring: object="{record.get("object")}", '
                            f'total_score="{total_score}", keeping red state (outlier score >= 100)'
                        )
            else:
                logging.debug(
                    f'set_wlk_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", keeping {object_state} state (0 < score < 100)'
                )
        else:
            # total_score == 0 or total_score <= 0
            # Check if score is 0 due to false_positive (global false positive, not just outliers)
            score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
            has_false_positive = "false_positive" in score_source_list
            if has_false_positive:
                # Score is 0 due to false_positive, set to green (anomaly_reason will remain visible for audit)
                object_state = "green"
                status = 1
                logging.debug(
                    f'set_wlk_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", score_source="{score_source}", '
                    f'setting state to green (false positive set, score cancelled)'
                )
            elif score_outliers is not None and score_outliers <= 0:
                # Outliers are suppressed (false positive), set to green
                object_state = "green"
                status = 1
                logging.debug(
                    f'set_wlk_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", score_outliers="{score_outliers}", '
                    f'setting state to green (outliers suppressed)'
                )
            else:
                # Keep current state if legacy behavior
                logging.debug(
                    f'set_wlk_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", score_outliers="{score_outliers}", '
                    f'keeping current state (score == 0)'
                )
    # define anomaly_reason
    if object_state == "green":
        status_message_str = f"The entity status is complying with monitoring rules (status: {status}, status_description: {status_description})"
        status_message.append(status_message_str)
        # Check if false positive is set - if so, preserve anomaly reasons from score_definition
        score_source = record.get("score_source", [])
        score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
        has_false_positive = "false_positive" in score_source_list
        if has_false_positive and score_definition and "components" in score_definition:
            # Extract anomaly reasons from score_definition components
            for component in score_definition.get("components", []):
                component_type = component.get("type")
                if component_type:
                    mapped_reason = get_anomaly_reason_from_component_type(component_type)
                    if mapped_reason and mapped_reason not in anomaly_reason:
                        anomaly_reason.append(mapped_reason)
            # If no components found, still add "none"
            if not anomaly_reason:
                anomaly_reason.append("none")
        else:
            anomaly_reason.append("none")
    else:
        # Other statements
        # ML Outlier: Check for outliers: either isOutlier == 1 (traditional) or score_outliers > 0 (hybrid scoring)
        if isOutlier == 1 or (score_outliers is not None and score_outliers > 0):
            # Always add outlier reasons when outliers are present (either traditional or hybrid scoring)
            outlier_reasons = record.get("isOutlierReason", [])
            if outlier_reasons:
                if isinstance(outlier_reasons, list):
                    # Join the list elements into a single string
                    outlier_reasons_str = " | ".join(outlier_reasons)
                    status_message.append(outlier_reasons_str)
                else:
                    # If it's not a list, append it directly
                    status_message.append(outlier_reasons)
            # Add anomaly reason for outliers (either traditional or hybrid scoring)
            if "ml_outliers_detection" not in anomaly_reason:
                anomaly_reason.append("ml_outliers_detection")
            # Add status message for orange state (score_outliers > 0 and < 100)
            if score_outliers is not None and score_outliers > 0 and score_outliers < 100:
                base_score = float(score) if score is not None else 0.0
                status_message.append(
                    f"Entity has an impact score of {score_outliers:.1f} (base score: {base_score:.1f}), which is above 0 but below 100. "
                    f"This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
                )
        # Skipping
        if (
            skipped_pct > 0
            or skipped_pct_last_60m > 0
            or skipped_pct_last_4h > 0
            or skipped_pct_last_24h > 0
        ):
            status_message.append(
                f"skipping searches were detected, review and address performance issues for this search or finetune its scheduling plan to clear this alert. (skipped_pct_last_60m: {skipped_pct_last_60m}, skipped_pct_last_4h: {skipped_pct_last_4h}, skipped_pct_last_24h: {skipped_pct_last_24h})"
            )
            anomaly_reason.append("skipping_searches_detected")
            # Add score increment for skipping searches if scoring is enabled (using VT-specific impact score)
            if score is not None and total_score is not None:
                increment = get_impact_score(vtenant_account, "impact_score_wlk_skipping_searches", 100)
                total_score += increment
                score_definition["components"].append({
                    "type": "skipping_searches_detected",
                    "score": increment,
                    "description": "Skipping searches detected"
                })
                # Convert total_score to integer if it's a whole number, otherwise keep as float
                if total_score is not None:
                    if total_score == int(total_score):
                        score_definition["total_score"] = int(total_score)
                    else:
                        score_definition["total_score"] = total_score
                else:
                    score_definition["total_score"] = total_score
        # Errors count
        if (
            count_errors > 0
            or count_errors_last_60m > 0
            or count_errors_last_4h > 0
            or count_errors_last_24h > 0
        ):
            status_message.append(
                f"execution errors were detected, review and address these errors to clear this alert. (count_errors_last_60m: {count_errors_last_60m}, count_errors_last_4h: {count_errors_last_4h}, count_errors_last_24h: {count_errors_last_24h})"
            )
            anomaly_reason.append("execution_errors_detected")
            # Add score increment for execution errors if scoring is enabled (using VT-specific impact score)
            if score is not None and total_score is not None:
                increment = get_impact_score(vtenant_account, "impact_score_wlk_execution_errors", 100)
                total_score += increment
                score_definition["components"].append({
                    "type": "execution_errors_detected",
                    "score": increment,
                    "description": "Execution errors detected"
                })
                # Convert total_score to integer if it's a whole number, otherwise keep as float
                if total_score is not None:
                    if total_score == int(total_score):
                        score_definition["total_score"] = int(total_score)
                    else:
                        score_definition["total_score"] = total_score
                else:
                    score_definition["total_score"] = total_score
        # orphan
        if orphan == 1:
            status_message.append(
                f"the search was detected as an orphan search which means the user owning the search is not currently a valid user (orphan: {orphan}, time check: {orphan_last_check}"
            )
            anomaly_reason.append("orphan_search_detected")
            # Add score increment for orphan search if scoring is enabled (using VT-specific impact score)
            if score is not None and total_score is not None:
                increment = get_impact_score(vtenant_account, "impact_score_wlk_orphan_search", 100)
                total_score += increment
                score_definition["components"].append({
                    "type": "orphan_search_detected",
                    "score": increment,
                    "description": "Orphan search detected"
                })
                # Convert total_score to integer if it's a whole number, otherwise keep as float
                if total_score is not None:
                    if total_score == int(total_score):
                        score_definition["total_score"] = int(total_score)
                    else:
                        score_definition["total_score"] = total_score
                else:
                    score_definition["total_score"] = total_score
        # delayed
        if isDelayed == 1 and cron_exec_sequence_sec > 0:
            status_message.append(
                f"the search was detected as delayed, this means the search is not running as expected (isDelayed: {isDelayed}, last_seen: {last_seen_datetime}, cron_exec_sequence_sec: {cron_exec_sequence_sec}, current delay: {current_delay_duration} duration)"
            )
            anomaly_reason.append("execution_delayed")
            # Add score increment for execution delayed if scoring is enabled (using VT-specific impact score)
            if score is not None and total_score is not None:
                increment = get_impact_score(vtenant_account, "impact_score_wlk_execution_delayed", 100)
                total_score += increment
                score_definition["components"].append({
                    "type": "execution_delayed",
                    "score": increment,
                    "description": "Execution delayed"
                })
                # Convert total_score to integer if it's a whole number, otherwise keep as float
                if total_score is not None:
                    if total_score == int(total_score):
                        score_definition["total_score"] = int(total_score)
                    else:
                        score_definition["total_score"] = total_score
                else:
                    score_definition["total_score"] = total_score
        # Monitoring time policy, add the message first then the anomaly reason
        if isUnderMonitoring is False:
            status_message.append(isUnderMonitoringMsg)
            # Use new monitoring anomaly reason if provided, otherwise use legacy
            if monitoring_anomaly_reason:
                anomaly_reason.append(monitoring_anomaly_reason)
            else:
                anomaly_reason.append("out_of_monitoring_times")
            # Add score increment for out of monitoring times if scoring is enabled
            if score is not None and total_score is not None:
                increment = get_impact_score(vtenant_account, "impact_score_wlk_out_of_monitoring_times", 100)
                total_score += increment
                score_definition["components"].append({
                    "type": "out_of_monitoring_times",
                    "score": increment,
                    "description": "Out of monitoring times"
                })
                # Convert total_score to integer if it's a whole number, otherwise keep as float
                if total_score is not None:
                    if total_score == int(total_score):
                        score_definition["total_score"] = int(total_score)
                    else:
                        score_definition["total_score"] = total_score
                else:
                    score_definition["total_score"] = total_score
        # if we failed to identify the reason
        if len(status_message) == 0:
            status_message_str = f"The entity status is not complying with monitoring rules (status: {status}, status_description: {status_description})"
            status_message.append(status_message_str)
            anomaly_reason.append("status_not_met")
            # Add score increment for status_not_met if scoring is enabled
            if score is not None and total_score is not None:
                increment = get_impact_score(vtenant_account, "impact_score_wlk_status_not_met", 100)
                total_score += increment
                score_definition["components"].append({
                    "type": "status_not_met",
                    "score": increment,
                    "description": "Status not met"
                })
                # Convert total_score to integer if it's a whole number, otherwise keep as float
                if total_score is not None:
                    if total_score == int(total_score):
                        score_definition["total_score"] = int(total_score)
                    else:
                        score_definition["total_score"] = total_score
                else:
                    score_definition["total_score"] = total_score
    # form status_message_json
    status_message_json["status_message"] = status_message
    status_message_json["anomaly_reason"] = anomaly_reason
    # Add score information to status_message_json for UI display (sorted alphabetically)
    # Use total_score if calculated (hybrid scoring), otherwise use base score
    if total_score is not None:
        status_message_json["score"] = float(total_score)
        # Update record score to reflect the calculated total_score for UI consistency
        record["score"] = float(total_score)
        # Add score definition for drilldown modal
        if score_definition:
            status_message_json["score_definition"] = score_definition
            record["score_definition"] = json.dumps(score_definition) if isinstance(score_definition, dict) else score_definition
    elif score is not None:
        status_message_json["score"] = float(score)
    if score_outliers is not None:
        status_message_json["score_outliers"] = float(score_outliers)
    if total_score is not None:
        status_message_json["total_score"] = float(total_score)
    # get disruption_duration
    # disruption queue management: entities with a minimal disruption time only turn
    # red once the disruption has lasted long enough; until then they show as blue
    if not disruption_queue_record:
        record["disruption_min_time_sec"] = 0
    else:
        logger.debug(
            f'disruption_queue_record="{disruption_queue_record}", getting disruption_duration'
        )
        disruption_object_state = disruption_queue_record.get("object_state", "green")
        try:
            disruption_min_time_sec = int(
                disruption_queue_record.get("disruption_min_time_sec", 0)
            )
        except:
            disruption_min_time_sec = 0
        # add to the record
        record["disruption_min_time_sec"] = disruption_min_time_sec
        try:
            disruption_start_epoch = float(
                disruption_queue_record.get("disruption_start_epoch", 0)
            )
        except:
            disruption_start_epoch = 0
        # Case 1: Entity is no longer in alert state (not red)
        if object_state != "red":
            # Only update if we were previously tracking a disruption
            if disruption_object_state == "red":
                disruption_queue_record["object_state"] = object_state
                disruption_queue_record["disruption_start_epoch"] = 0
                disruption_queue_record["mtime"] = time.time()
                try:
                    disruption_queue_update(
                        disruption_queue_collection, disruption_queue_record
                    )
                except Exception as e:
                    logger.error(f"error updating disruption_queue_record: {e}")
            # non-red entities exit here: no disruption duration logic applies
            return object_state, status_message, status_message_json, anomaly_reason
        # Case 2: Entity is in alert state (red)
        if object_state == "red":
            current_time = time.time()
            # If this is a new disruption, start tracking it
            if disruption_object_state != "red":
                disruption_queue_record["object_state"] = "red"
                disruption_queue_record["disruption_start_epoch"] = current_time
                disruption_queue_record["mtime"] = current_time
                try:
                    disruption_queue_update(
                        disruption_queue_collection, disruption_queue_record
                    )
                except Exception as e:
                    logger.error(f"error updating disruption_queue_record: {e}")
                # For new disruptions, if min time is set, show as blue with message
                if disruption_min_time_sec > 0:
                    object_state = "blue"
                    status_message.append(
                        f"Minimal disruption time is configured for this entity, the current disruption duration is 0 which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
                    )
                    status_message_json["status_message"] = status_message
                # new disruptions exit here (red, or blue when min time applies)
                return object_state, status_message, status_message_json, anomaly_reason
            # If we're already tracking a disruption, check duration
            if disruption_min_time_sec > 0:
                try:
                    disruption_duration = current_time - disruption_start_epoch
                except Exception as e:
                    logger.error(f"error calculating disruption_duration: {e}")
                    disruption_duration = 0
                # If duration hasn't breached threshold, show as blue with message
                if disruption_duration < disruption_min_time_sec:
                    object_state = "blue"
                    status_message.append(
                        f"Minimal disruption time is configured for this entity, the current disruption duration is {convert_seconds_to_duration(disruption_duration)} which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
                    )
                    status_message_json["status_message"] = status_message
    # anomaly_reason sanity check, if the list has more than 1 item, and contains "none", remove it
    if isinstance(anomaly_reason, list):
        if len(anomaly_reason) > 1 and "none" in anomaly_reason:
            anomaly_reason.remove("none")
    # return
    logging.debug(
        f'set_wlk_status, object="{record.get("object")}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
    )
    return object_state, status_message, status_message_json, anomaly_reason
def ack_check(object_value, ack_collection_keys, ack_collection_dict, record):
    """
    Enrich record in place with acknowledgment fields.

    When object_value is a known ack key and the stored ack entry belongs to
    the same object_category as the record, the ack fields are copied over;
    in every other case the record falls back to inactive/N-A defaults.
    """
    defaults = {
        "ack_state": "inactive",
        "ack_type": "N/A",
        "ack_comment": "N/A",
        "ack_expiration": "N/A",
        "ack_mtime": "N/A",
    }
    ack_record = (
        ack_collection_dict.get(object_value)
        if object_value in ack_collection_keys
        else None
    )
    if ack_record and ack_record.get("object_category") == record.get(
        "object_category"
    ):
        # copy each ack field, falling back per-field to the default value
        for field_name, default_value in defaults.items():
            record[field_name] = ack_record.get(field_name, default_value)
    else:
        record.update(defaults)
def define_state_icon_code(record):
    """
    Return the 3-digit state icon code for a record.

    The code is derived from the record's object_state and ack_state,
    mirroring the Splunk macro conditions. An unknown ack_state falls back
    to the state's "no ack" code, and an unknown object_state yields "999".

    Args:
        record (dict): must expose 'object_state' and optionally 'ack_state'.

    Returns:
        str: the state_icon_code ("001" to "012", or "999" as fallback).
    """
    codes_by_state = {
        "green": {"inactive": "001", "active": "002", None: "003"},
        "red": {"inactive": "004", "active": "005", None: "006"},
        "orange": {"inactive": "007", "active": "008", None: "009"},
        "blue": {"inactive": "010", "active": "011", None: "012"},
    }
    state_codes = codes_by_state.get(record.get("object_state"))
    if state_codes is None:
        # object_state is unmapped: nothing to fall back on
        return "999"
    # unknown ack_state values fall back to the state's "no ack" code
    return state_codes.get(record.get("ack_state"), state_codes[None])
def outliers_readiness(record):
    """
    Update record in place with outliers readiness and status fields.

    - outliers_readiness: normalized to the strict string "True" or "False"
    - OutliersIsOk: 1 when isOutlier is 0 (or missing/invalid), 0 otherwise
    - OutliersStatus: "green" when OutliersIsOk is 1, "red" otherwise

    Args:
        record (dict): the entity record, mutated in place.
    """
    # Normalize outliers_readiness to a strict "True"/"False" string
    record["outliers_readiness"] = (
        "True" if record.get("outliers_readiness") == "True" else "False"
    )
    # Safely coerce isOutlier to an integer
    # bugfix: also catch TypeError so a None (or other non-numeric) value is
    # treated as "not an outlier" instead of raising, as the docstring promises
    try:
        is_outlier = int(record.get("isOutlier", 0))
    except (ValueError, TypeError):
        is_outlier = 0
    # Determine OutliersIsOk based on isOutlier
    record["OutliersIsOk"] = 1 if is_outlier == 0 else 0
    # Set OutliersStatus based on OutliersIsOk (always use text-based status)
    record["OutliersStatus"] = "green" if record["OutliersIsOk"] == 1 else "red"
def sampling_anomaly_status(record):
    """
    Updates the record with SamplingIsOk and SamplingStatus based on the isAnomaly field.

    Sets:
        - SamplingIsOk: 1 when isAnomaly is 0 (or missing/invalid), 0 when an anomaly is flagged
        - SamplingStatus: "green" when SamplingIsOk is 1, "red" otherwise
    """
    # get isAnomaly, any missing or non-numeric value is treated as no anomaly
    try:
        isAnomaly = int(record.get("isAnomaly", 0))
    except Exception as e:
        isAnomaly = 0
    # define SamplingIsOk
    # bugfix: the previous expression was "1 if isAnomaly == 0 else 1", which
    # returned 1 on both branches so an anomaly could never turn the sampling
    # status red; this now mirrors the OutliersIsOk logic in outliers_readiness
    record["SamplingIsOk"] = 1 if isAnomaly == 0 else 0
    # define SamplingStatus (always use text-based status)
    record["SamplingStatus"] = "green" if record["SamplingIsOk"] == 1 else "red"
def logical_group_lookup(
    object_value,
    logicalgroup_members_collection_keys,
    logicalgroup_members_collection_dict,
    record,
):
    """
    Enrich the record with Logical Group membership details.

    When object_value is a known logical group member, object_group_key and
    object_group_name are copied from the membership collection; otherwise
    both fields are stripped from the record.
    """
    group_fields = ("object_group_key", "object_group_name")
    if object_value in logicalgroup_members_collection_keys:
        member_record = logicalgroup_members_collection_dict.get(object_value)
        # Copy the group attributes, defaulting to None when absent
        for field in group_fields:
            record[field] = member_record.get(field, None)
    else:
        # Not a member: make sure no stale group attributes remain
        for field in group_fields:
            record.pop(field, None)
def set_feeds_lag_summary(record, component):
    """
    Generates a human readable lag summary for an entity.

    For dsm/dhm the summary combines data_last_lag_seen and
    data_last_ingestion_lag_seen ("<event lag> / <ingestion lag>"); for mhm
    it is based on last_lag_seen only.

    Args:
        record: the entity record (dict-like); lag fields default to 0 when
            missing or non-numeric
        component: the TrackMe component, one of "dsm", "dhm" or "mhm"

    Returns:
        str: the lag summary; an empty string for any other component
        (robustness fix: lag_summary was previously never assigned in that
        case, raising an UnboundLocalError on return)
    """
    lag_summary = ""
    if component in ["dsm", "dhm"]:
        try:
            data_last_lag_seen = int(
                round(float(record.get("data_last_lag_seen", 0)), 0)
            )
        except Exception as e:
            data_last_lag_seen = 0
        try:
            data_last_ingestion_lag_seen = int(
                round(float(record.get("data_last_ingestion_lag_seen", 0)), 0)
            )
        except Exception as e:
            data_last_ingestion_lag_seen = 0
        # NOTE(review): this condition mixes both variables (lag_seen > 60 or
        # ingestion_lag_seen < -60) although it only drives the rendering of
        # data_last_lag_seen - confirm this is intended and not a copy/paste
        # of the ingestion condition below
        if data_last_lag_seen > 60 or data_last_ingestion_lag_seen < -60:
            data_last_lag_seen_duration = (
                f"{convert_seconds_to_duration(data_last_lag_seen)}"
            )
        elif data_last_lag_seen == 0:
            data_last_lag_seen_duration = "0 sec"
        elif data_last_lag_seen < 60:
            # NOTE(review): positive values in 1..59 render with a leading
            # dash (e.g. "-30 sec") and negative values render with a double
            # dash - confirm the expected rendering
            data_last_lag_seen_duration = f"-{data_last_lag_seen} sec"
        else:
            data_last_lag_seen_duration = f"{data_last_lag_seen} sec"
        if data_last_ingestion_lag_seen > 60 or data_last_ingestion_lag_seen < -60:
            data_last_ingestion_lag_seen_duration = (
                f"{convert_seconds_to_duration(data_last_ingestion_lag_seen)}"
            )
        elif data_last_ingestion_lag_seen == 0:
            data_last_ingestion_lag_seen_duration = "0 sec"
        elif data_last_ingestion_lag_seen < 60:
            data_last_ingestion_lag_seen_duration = (
                f"-{data_last_ingestion_lag_seen} sec"
            )
        else:
            data_last_ingestion_lag_seen_duration = (
                f"{data_last_ingestion_lag_seen} sec"
            )
        # combined rendering: event lag / ingestion lag
        lag_summary = (
            f"{data_last_lag_seen_duration} / {data_last_ingestion_lag_seen_duration}"
        )
    elif component in ["mhm"]:
        # original logic: lag_summary= if(last_lag_seen>60, tostring(last_lag_seen, "duration"), last_lag_seen . " sec")
        try:
            last_lag_seen = int(round(float(record.get("last_lag_seen", 0)), 0))
        except Exception as e:
            last_lag_seen = 0
        if last_lag_seen > 60:
            lag_summary = f"{convert_seconds_to_duration(last_lag_seen)}"
        else:
            lag_summary = f"{last_lag_seen} sec"
    return lag_summary
def set_feeds_thresholds_duration(record):
    """
    Render the feed thresholds (max delay / max lag allowed) as human
    readable durations.

    Returns:
        tuple: (data_max_delay_allowed_duration, data_max_lag_allowed_duration)
    """

    def _as_rounded_int(field):
        # coerce the threshold into a rounded integer, invalid values become 0
        try:
            return int(round(float(record.get(field, 0)), 0))
        except Exception:
            return 0

    delay_duration = convert_seconds_to_duration(
        _as_rounded_int("data_max_delay_allowed")
    )
    lag_duration = convert_seconds_to_duration(_as_rounded_int("data_max_lag_allowed"))
    return delay_duration, lag_duration
def set_cim_duration(record):
    """
    Render the last CIM tracker runtime (tracker_last_duration) as a human
    readable duration string; invalid or missing values are treated as 0.
    """
    try:
        duration_sec = int(round(float(record.get("tracker_last_duration", 0)), 0))
    except Exception:
        duration_sec = 0
    return convert_seconds_to_duration(duration_sec)
def dsm_sampling_lookup(
    object_value, sampling_collection_keys, sampling_collection_dict, record
):
    """
    Enrich the record with data sampling information.

    When the entity is known in the sampling collection and the feature is
    enabled (or was auto-disabled), the sampling fields are copied from the
    collection record; when the feature is explicitly disabled, static
    "disabled" values are applied; unknown entities receive pending defaults.
    """
    pending_defaults = {
        "data_sample_feature": "enabled",
        "data_sample_status_message": {
            "state": "pending",
            "desc": "Data Sampling is pending and has not been performed yet for this entity",
        },
        "data_sample_status_colour": "N/A",
        "data_sample_anomaly_reason": "N/A",
    }
    if object_value not in sampling_collection_keys:
        # entity not yet known to the sampling engine
        record.update(pending_defaults)
        return
    sampling_record = sampling_collection_dict.get(object_value)
    feature_state = sampling_record.get("data_sample_feature", "enabled")
    if feature_state in ("enabled", "disabled_auto"):
        # copy the sampling fields, falling back to the pending defaults
        # (both feature states share the exact same handling)
        for field, default_value in pending_defaults.items():
            record[field] = sampling_record.get(field, default_value)
    else:
        # sampling explicitly disabled for this entity
        record["data_sample_feature"] = "disabled"
        record["data_sample_status_message"] = {
            "state": "disabled",
            "desc": "Data sampling features are currently disabled for this entity.",
        }
        record["data_sample_status_colour"] = "grey"
        record["data_sample_anomaly_reason"] = "None"
def outliers_data_lookup(
    key_value,
    outliers_data_collection_keys,
    outliers_data_collection_dict,
    outliers_rules_collection_keys,
    outliers_rules_collection_dict,
    record,
):
    """
    Enrich the record with outliers detection results and rules state.

    Data fields (isOutlier, isOutlierReason, models_in_anomaly) come from the
    outliers data collection; the rules collection only drives the
    OutliersDisabled flag (sourced from its is_disabled field, default 0).
    """
    #
    # Outliers detection results
    #
    data_defaults = {
        "isOutlier": 0,
        "isOutlierReason": "",
        "models_in_anomaly": "",
    }
    if key_value in outliers_data_collection_keys:
        data_record = outliers_data_collection_dict.get(key_value)
        for field, default_value in data_defaults.items():
            record[field] = data_record.get(field, default_value)
    else:
        record.update(data_defaults)
    #
    # Outliers rules state: is_disabled is exposed as OutliersDisabled
    #
    if key_value in outliers_rules_collection_keys:
        rules_record = outliers_rules_collection_dict.get(key_value)
        record["OutliersDisabled"] = rules_record.get("is_disabled", 0)
    else:
        record["OutliersDisabled"] = 0
def get_coll_docs_ref(collection, docs_collection_name):
    """
    Load the whole docs reference collection and build lookup structures.

    Returns a tuple of:
        - list of raw collection records
        - dict keyed by record _key with doc_note/doc_link/object_members
        - flat list of unique member objects
        - dict keyed by member with its owning doc information (the first doc
          referencing a member wins)
    """
    all_records = []
    records_by_key = {}
    members = []
    members_index = {}
    offset = 0
    while True:
        # paginate through the KVstore collection, 5000 records per page
        batch = collection.data.query(skip=offset)
        if not batch:
            break
        for doc in batch:
            all_records.append(doc)
            records_by_key[doc.get("_key")] = {
                "doc_note": doc.get("doc_note"),
                "doc_link": doc.get("doc_link"),
                "object_members": doc.get("object", []),
            }
            # index each not-yet-seen member to its owning doc
            for member in doc.get("object", []):
                if member not in members:
                    members.append(member)
                    members_index[member] = {
                        "doc_key": doc.get("_key"),
                        "doc_note": doc.get("doc_note"),
                        "doc_link": doc.get("doc_link"),
                        "object_members": doc.get("object", []),
                    }
        offset += 5000
    return all_records, records_by_key, members, members_index
def docs_ref_lookup(
    docs_is_global,
    docs_note_default_global,
    docs_link_default_global,
    object_value,
    docs_members_collection_keys,
    docs_members_collection_dict,
    record,
):
    """
    Enrich the record with documentation reference fields.

    Entities with a dedicated doc entry get their own doc_note/doc_link and
    doc_is_global forced to False; all other entities inherit the global
    defaults passed by the caller.
    """
    if object_value in docs_members_collection_keys:
        doc_entry = docs_members_collection_dict.get(object_value)
        # entity specific documentation overrides the global one
        record["doc_is_global"] = False
        record["doc_note"] = doc_entry.get("doc_note", docs_note_default_global)
        record["doc_link"] = doc_entry.get("doc_link", docs_link_default_global)
    else:
        record["doc_is_global"] = docs_is_global
        record["doc_note"] = docs_note_default_global
        record["doc_link"] = docs_link_default_global
def wlk_disabled_apps_lookup(
    app_value,
    apps_enablement_collection_keys,
    apps_enablement_collection_dict,
    record,
):
    """
    Set app_is_enabled on the record from the apps enablement collection.

    Apps without an enablement entry are considered enabled ("True").
    """
    if app_value in apps_enablement_collection_keys:
        enablement_entry = apps_enablement_collection_dict.get(app_value)
        record["app_is_enabled"] = enablement_entry.get("enabled", "True")
    else:
        record["app_is_enabled"] = "True"
def wlk_versioning_lookup(
    key_value,
    versioning_collection_keys,
    versioning_collection_dict,
    record,
):
    """
    Enrich the record with versioning information from the versioning collection.

    When key_value is known, versioning_available is set to "True",
    cron_exec_sequence_sec is copied from the collection record and
    object_description is taken from its "description" field, falling back to
    "No description" when empty or missing. Unknown entities receive the
    defaults with versioning_available set to "False".

    Fixes: docstring previously copied from the apps enablement lookup; a
    dead branch (empty-string length check inside a truthiness check) and the
    redundant double assignment of versioning_available were removed.
    """
    versioning_defaults = {
        "cron_exec_sequence_sec": 0,
        "object_description": "No description",
        "versioning_available": "False",
    }
    if key_value in versioning_collection_keys:
        lookup_record = versioning_collection_dict.get(key_value)
        record["versioning_available"] = "True"
        # a truthy description can never be empty, so no length check is needed
        description = lookup_record.get("description", None)
        record["object_description"] = (
            description if description else versioning_defaults["object_description"]
        )
        record["cron_exec_sequence_sec"] = lookup_record.get(
            "cron_exec_sequence_sec", versioning_defaults["cron_exec_sequence_sec"]
        )
        logging.debug(
            f'versioning found for object="{record.get("object")}", object_key="{record.get("keyid")}", using key_value="{key_value}"'
        )
    else:
        record.update(versioning_defaults)
        logging.debug(
            f'no versioning found for object="{record.get("object")}", object_key="{record.get("keyid")}", using key_value="{key_value}"'
        )
def wlk_orphan_lookup(
    key_value,
    orphan_collection_keys,
    orphan_collection_dict,
    record,
):
    """
    Set the orphan flag on the record from the orphan collection.

    Entities without an orphan entry default to orphan=0.
    """
    if key_value in orphan_collection_keys:
        orphan_entry = orphan_collection_dict.get(key_value)
        record["orphan"] = orphan_entry.get("orphan", 0)
    else:
        record["orphan"] = 0
def apply_blocklist(record, blocklist_not_regex, blocklist_regex):
    """
    Determines whether a record should be appended based on blocklist rules.

    The record is temporarily enriched with list versions of index,
    sourcetype and metric_category so rules can match multi-value fields.

    bugfix: these temporary mutations are now reverted on every return path;
    previously an early blocklist match returned with the temporary "index"
    and "sourcetype" keys still set on the record and metric_category left
    as a list instead of a csv string.

    :param record: The record to check (dict, mutated temporarily).
    :param blocklist_not_regex: Dict of blocklist rules without regex.
    :param blocklist_regex: Dict of blocklist rules with regex.
    :return: True if the record should be appended, False otherwise.
    """

    def match_not_regex(field_value, rule):
        """Check if a field value matches a non-regex blocklist rule."""
        if isinstance(field_value, list):
            return any(item == rule.get("object") for item in field_value)
        else:
            return field_value == rule.get("object")

    def match_regex(field_value, rule):
        """Check if a field value matches a regex blocklist rule."""
        if isinstance(field_value, list):
            return any(re.match(rule.get("object"), item) for item in field_value)
        else:
            return re.match(rule.get("object"), field_value)

    def cleanup():
        """Revert the temporary record mutations made for rule matching."""
        record.pop("index", None)
        record.pop("sourcetype", None)
        # metric_category was split into a list above; join it back to csv
        if "metric_category" in record and isinstance(record["metric_category"], list):
            record["metric_category"] = ",".join(record["metric_category"])

    # define index and add to the record, using data_index if available and turn into a list from csv
    if "data_index" in record:
        record["index"] = record["data_index"].split(",")
    # same for data_sourcetype and sourcetype
    if "data_sourcetype" in record:
        record["sourcetype"] = record["data_sourcetype"].split(",")
    # metric_category is called equally in record and blocklist, but it can be a list too
    if "metric_category" in record:
        record["metric_category"] = record["metric_category"].split(",")

    try:
        # Check blocklist without regex
        for _, rule in blocklist_not_regex.items():
            object_category = rule.get("object_category")
            if object_category in record and match_not_regex(
                record[object_category], rule
            ):
                return False  # Match found in blocklist not regex, do not append
        # Check blocklist with regex
        for _, rule in blocklist_regex.items():
            object_category = rule.get("object_category")
            if object_category in record and match_regex(record[object_category], rule):
                return False  # Regex match found in blocklist, do not append
        return True  # If no blocklist rules matched, append the record
    finally:
        # always restore the record, whatever the outcome
        cleanup()
def dsm_check_default_thresholds(record, trackme_conf):
    """
    Ensure the DSM thresholds fields exist on the record and are valid.

    Missing or None fields are set to the system defaults from trackme_conf;
    present numeric thresholds are coerced to float, falling back to the
    default when the conversion fails.
    """
    general_conf = trackme_conf["splk_general"]
    defaults = {
        "data_max_delay_allowed": general_conf["splk_general_dsm_delay_default"],
        "data_max_lag_allowed": general_conf["splk_general_dsm_threshold_default"],
        "data_override_lagging_class": "false",
        "allow_adaptive_delay": "true",
    }
    numeric_fields = ("data_max_delay_allowed", "data_max_lag_allowed")
    for field, default_value in defaults.items():
        if record.get(field) is None:
            # missing or explicitly None: apply the default as-is
            record[field] = default_value
        elif field in numeric_fields:
            # validate numeric thresholds, reverting to default when invalid
            try:
                record[field] = float(record[field])
            except ValueError:
                record[field] = default_value
def dhm_check_default_thresholds(record, trackme_conf):
    """
    Ensure the DHM thresholds fields exist on the record and are valid.

    Missing or None fields are set to the system defaults from trackme_conf;
    present numeric thresholds are coerced to float, falling back to the
    default when the conversion fails.
    """
    general_conf = trackme_conf["splk_general"]
    defaults = {
        "data_max_delay_allowed": general_conf["splk_general_dhm_delay_default"],
        "data_max_lag_allowed": general_conf["splk_general_dhm_threshold_default"],
        "data_override_lagging_class": "false",
        "allow_adaptive_delay": "true",
        "splk_dhm_alerting_policy": "global_policy",
    }
    numeric_fields = ("data_max_delay_allowed", "data_max_lag_allowed")
    for field, default_value in defaults.items():
        if record.get(field) is None:
            # missing or explicitly None: apply the default as-is
            record[field] = default_value
        elif field in numeric_fields:
            # validate numeric thresholds, reverting to default when invalid
            try:
                record[field] = float(record[field])
            except ValueError:
                record[field] = default_value
def dynamic_priority_lookup(
    key_value, priority_collection_keys, priority_collection_dict, record
):
    """
    Updates record with dynamic priority information if key_value exists in priority_collection_keys.

    Precedence (first match wins):
        1. a matching priority policy (key_value found in the priority collection)
        2. an externally provided priority (record field priority_external)
        3. default: priority_reason is set to "entity_managed"

    In all cases, when the record carries priority_updated=1 the priority
    value itself is not overwritten (the reason / policy metadata fields are
    still refreshed).
    """
    # get the value for priority_external and priority_reason
    priority_external = record.get("priority_external", None)
    priority_reason = record.get("priority_reason", "entity_managed")
    # first, check the value of priority_updated, if does not exist in the record, set to 0 and update the record
    # any parsing error (missing field, non-integer value) falls back to 0
    try:
        priority_updated = int(record["priority_updated"])
        # valid option are 0 or 1, if not one of these, set to 0
        if priority_updated not in [0, 1]:
            priority_updated = 0
    except Exception as e:
        priority_updated = 0
    # add to record as priority_updated
    record["priority_updated"] = priority_updated
    # priority policies always have precedence
    if key_value in priority_collection_keys:
        dynamic_priority_record = priority_collection_dict.get(key_value)
        dynamic_priority = dynamic_priority_record.get("priority", None)
        dynamic_priority_reason = dynamic_priority_record.get(
            "priority_reason", "entity_managed"
        )
        # if we have a match, and a priority, then update the record, otherwise do nothing
        if dynamic_priority:
            # add to record as priority_policy_value
            record["priority_policy_value"] = dynamic_priority
            # priority_updated=1 means the entity priority was manually set: keep it
            if priority_updated != 1:
                record["priority"] = dynamic_priority
                logging.debug(
                    f'match found applying dynamic priority="{dynamic_priority}" for object="{record.get("object")}", key="{key_value}", priority_reason="{dynamic_priority_reason}"'
                )
            else:
                logging.debug(
                    f'priority_updated is set to 1, skipping dynamic priority="{dynamic_priority}" for object="{record.get("object")}", key="{key_value}", priority_reason="{dynamic_priority_reason}"'
                )
            # policy metadata is always refreshed, even when the priority value was kept
            record["priority_policy_id"] = dynamic_priority_reason
            record["priority_reason"] = f"priority policy id: {dynamic_priority_reason}"
        # no match, set to default reason
        else:
            # if priority_reason contains "priority policy id" but we have no match, then set to default
            # otherwise, keep the existing value, it could be externally managed
            # also check that the fields is in record first
            if "priority_reason" in record:
                if "priority policy id" in record["priority_reason"]:
                    record["priority_reason"] = "entity_managed"
            else:
                record["priority_reason"] = "entity_managed"
    elif priority_external:
        # attempt to get priority_reason, it not set, define to "externally_managed"
        priority_reason = record.get("priority_reason", "externally_managed")
        # if priority_external is in one of low, medium, high, critical, pending, then update
        if priority_external in ["low", "medium", "high", "critical", "pending"]:
            if priority_updated != 1:
                record["priority"] = priority_external
                logging.debug(
                    f'applying external priority="{priority_external}" for object="{record.get("object")}", priority_reason="{priority_reason}"'
                )
            else:
                logging.debug(
                    f'priority_updated is set to 1, skipping external priority="{priority_external}" for object="{record.get("object")}", priority_reason="{priority_reason}"'
                )
            record["priority_reason"] = f"{priority_reason}"
        else:
            # if priority_external is not in one of low, medium, high, critical, pending, log a warning as we refused this value
            logging.warning(
                f'external priority="{priority_external}" for object="{record.get("object")}" is not in the list of allowed values, priority_reason="{priority_reason}"'
            )
    else:
        # simply set priority_reason to the default value
        record["priority_reason"] = "entity_managed"
        logging.debug(
            f'no match found for object="{record.get("object")}", key="{key_value}", priority_reason="{priority_reason}"'
        )
def dynamic_tags_lookup(key_value, tags_collection_keys, tags_collection_dict, record):
    """
    Apply dynamic tags (tags_auto) to the record when a tags policy matches
    key_value; the record is left untouched otherwise.
    """
    if key_value not in tags_collection_keys:
        return
    tags_entry = tags_collection_dict.get(key_value)
    dynamic_tags = tags_entry.get("tags_auto", None)
    # only apply when the policy actually carries tags
    if dynamic_tags:
        record["tags_auto"] = dynamic_tags
        logging.debug(
            f'match found applying dynamic tags="{dynamic_tags}" for object="{record.get("object")}", key="{key_value}"'
        )
def dynamic_sla_class_lookup(
    key_value, sla_collection_keys, sla_collection_dict, record
):
    """
    Updates record with dynamic SLA class information if key_value exists in sla_collection_keys.

    (docstring fixed: it previously described the dynamic priority lookup)
    """
    if key_value in sla_collection_keys:
        dynamic_sla_record = sla_collection_dict.get(key_value)
        dynamic_sla = dynamic_sla_record.get("sla_class", None)
        # if we have a match and an sla_class value, then update the record, otherwise do nothing
        if dynamic_sla:
            record["sla_class"] = dynamic_sla
            logging.debug(
                f'match found applying dynamic sla_class="{dynamic_sla}" for object="{record.get("object")}", key="{key_value}"'
            )
def get_sla_timer(record, sla_classes, sla_default_class):
    """
    Compute and render the SLA timer for an entity.

    The SLA class is resolved from the record (falling back to the default
    class when missing or unknown), then the time spent in a red state is
    compared against the class threshold. Results are written both to the
    record and to a JSON summary (sla_message_json) used for rendering.
    """
    # JSON object containing a summary of the SLA information for rendering purposes
    sla_summary = {}

    def _publish(field, value):
        # write a field to both the record and the rendering summary
        record[field] = value
        sla_summary[field] = value

    # resolve the sla_class: fall back to the default when missing or unknown
    sla_class = record.get("sla_class", None)
    if not sla_class or sla_class not in sla_classes:
        sla_class = sla_default_class
        record["sla_class"] = sla_default_class
    sla_summary["sla_class"] = sla_class

    # resolve the class threshold in seconds, defaulting to 1 day
    try:
        sla_threshold = int(sla_classes[sla_class]["sla_threshold"])
    except Exception:
        sla_threshold = 86400
    record["sla_threshold_duration"] = convert_seconds_to_duration(sla_threshold)
    record["sla_threshold"] = sla_threshold
    sla_summary["object"] = record.get("object")

    # the SLA relies on the current KVstore object_state (updated by trackers)
    # rather than the realtime calculated object_state; the realtime state is
    # only used to detect a pending KVstore refresh
    kv_state = record.get("kvcurrent_object_state", "red")
    realtime_state = record.get("object_state", "red")
    sla_summary["object_state"] = kv_state

    if kv_state == "red":
        # time spent red, measured from the latest flip time when available
        try:
            latest_flip_time = float(record.get("latest_flip_time", 0))
        except Exception:
            latest_flip_time = 0
        if latest_flip_time > 0:
            sla_timer = int(round(float(int(time.time()) - latest_flip_time), 0))
        else:
            sla_timer = 0
        _publish("sla_timer", sla_timer)
        _publish("sla_timer_duration", convert_seconds_to_duration(sla_timer))
        sla_is_breached = 1 if sla_timer > sla_threshold else 0
        _publish("sla_is_breached", sla_is_breached)
        if sla_is_breached == 1:
            _publish(
                "sla_message",
                f"SLA breached, the entity has been in a red state for more than {convert_seconds_to_duration(sla_timer)} (sla_class: {sla_class}, sla_class_threshold: {convert_seconds_to_duration(sla_threshold)}, sla_timer_sec: {int(round(sla_timer, 0))} sec, sla_threshold_sec: {sla_threshold} sec)",
            )
        else:
            _publish(
                "sla_message",
                f"SLA is not breached, the entity has been in a red state for {convert_seconds_to_duration(sla_timer)} (sla_class: {sla_class}, sla_class_threshold: {convert_seconds_to_duration(sla_threshold)}, sla_timer_sec: {int(round(sla_timer, 0))} sec, sla_threshold_sec: {sla_threshold} sec)",
            )
    elif kv_state == "green" and realtime_state == "red":
        # realtime state is already red but the KVstore state is not refreshed yet
        _publish("sla_timer", 0)
        _publish("sla_timer_duration", "0 sec")
        _publish("sla_is_breached", 0)
        _publish(
            "sla_message",
            "SLA status refresh is pending, the realtime entity state is red and SLA will be reflected in the next minutes once the KVstore status is updated by trackers",
        )
    else:
        _publish("sla_timer", 0)
        _publish("sla_timer_duration", "0 sec")
        _publish("sla_is_breached", 0)
        _publish("sla_message", "SLA is not breached, the entity is not in a red state")

    # expose the summary on the record
    record["sla_message_json"] = sla_summary
def flx_thresholds_lookup(object_value, key_value, record, thresholds_collection_dict):
    """
    Attach dynamic thresholds to the record.

    Every entry of thresholds_collection_dict carrying a truthy object_id
    equal to key_value is collected into record["dynamic_thresholds"] (an
    empty dict when nothing matches). Always returns True.

    Example collection entry:
    {
        "c6745c4d9190e2f18bd83e4448a0584da54a832fa57dfd838b58940c8fced934": {
            "metric_name": "soar.mem_used_pct",
            "value": 80,
            "operator": ">",
            "condition_true": True,
            "mtime": 1747012850.5604594,
            "comment": "No comment for update.",
            "object_id": "199fc4f889ff4946181bb00f56aad44c7580dd87691de699e1c0d2fc851a1ec5",
            "_user": "nobody",
            "_key": "c6745c4d9190e2f18bd83e4448a0584da54a832fa57dfd838b58940c8fced934"
        }
    }
    """
    matching_thresholds = {}
    if thresholds_collection_dict:
        matching_thresholds = {
            threshold_key: threshold
            for threshold_key, threshold in thresholds_collection_dict.items()
            # only truthy object_id values are eligible for matching
            if threshold.get("object_id") and threshold.get("object_id") == key_value
        }
    record["dynamic_thresholds"] = matching_thresholds
    return True
def flx_drilldown_searches_lookup(tenant_id, tracker_name, account, record, drilldown_searches_collection_dict):
    """
    Enrich the record with the drilldown searches registered for its tracker(s).

    tracker_name may be a plain string, a JSON array string or a comma
    separated string (concurrent trackers). All matching drilldown searches
    are exposed as a list in record["drilldown_searches"]; the first one is
    also mapped to the legacy single-value fields (drilldown_search,
    drilldown_search_earliest, drilldown_search_latest) so existing UI code
    keeps working.

    Returns True when at least one drilldown search was found, False otherwise.
    """
    if not drilldown_searches_collection_dict:
        return False

    def _expand_tokens(search_string):
        # replace $token$ / $result.token$ placeholders with record values,
        # leaving unknown tokens (and unmatched $) untouched
        if not isinstance(search_string, str):
            return search_string

        def _substitute(match):
            token = match.group(1)
            # Handle $result.<token_name>$ format
            field_name = token[7:] if token.startswith("result.") else token
            if field_name in record:
                return str(record[field_name])
            return match.group(0)

        return re.sub(r"\$([^$]*)\$", _substitute, search_string)

    def _tracker_names_as_list(value):
        # normalize the tracker_name input (JSON array string, csv string,
        # plain string or list) into a list of tracker names
        if not value:
            return []
        if isinstance(value, list):
            return value
        if isinstance(value, str):
            try:
                parsed = json.loads(value)
            except (json.JSONDecodeError, TypeError):
                # not JSON: maybe an aggregated comma separated string
                if "," in value:
                    return [part.strip() for part in value.split(",") if part.strip()]
                return [value]
            # valid JSON but not an array: treat as a single tracker name
            return parsed if isinstance(parsed, list) else [value]
        # fallback: coerce to string and treat as a single tracker
        return [str(value)]

    # collect the drilldown searches attached to every matching tracker
    collected = []
    for candidate in _tracker_names_as_list(tracker_name):
        normalized_name = normalize_flx_tracker_name(tenant_id, candidate)
        for entry in drilldown_searches_collection_dict.values():
            if entry.get("tracker_name") != normalized_name:
                continue
            search_definition = entry.get("drilldown_search")
            if not search_definition:
                continue
            collected.append(
                {
                    "drilldown_search": _expand_tokens(search_definition),
                    "drilldown_search_earliest": entry.get("drilldown_search_earliest")
                    or "-24h",
                    "drilldown_search_latest": entry.get("drilldown_search_latest")
                    or "now",
                    "tracker_name": normalized_name,  # Include tracker name for UI display
                }
            )

    if not collected:
        return False

    # expose the full list for the UI, plus the legacy single-value fields
    record["drilldown_searches"] = collected
    record["drilldown_search"] = collected[0]["drilldown_search"]
    record["drilldown_search_earliest"] = collected[0]["drilldown_search_earliest"]
    record["drilldown_search_latest"] = collected[0]["drilldown_search_latest"]
    return True
def flx_default_metrics_lookup(tenant_id, tracker_name, record, default_metrics_collection_dict):
    """
    Resolve the default metric(s) for the record's tracker(s).

    tracker_name may be a plain string, a JSON array string or a comma
    separated string (concurrent trackers). Unique metric names registered
    for the matching trackers are stored in record["default_metric"]: a
    single string when one metric is found (backward compatibility), a list
    when several are found (UI multi-select support). Falls back to "status"
    and returns False when nothing matches.
    """

    def _tracker_names_as_list(value):
        # normalize the tracker_name input (JSON array string, csv string,
        # plain string or list) into a list of tracker names
        if not value:
            return []
        if isinstance(value, list):
            return value
        if isinstance(value, str):
            try:
                parsed = json.loads(value)
            except (json.JSONDecodeError, TypeError):
                # not JSON: maybe an aggregated comma separated string
                if "," in value:
                    return [part.strip() for part in value.split(",") if part.strip()]
                return [value]
            # valid JSON but not an array: treat as a single tracker name
            return parsed if isinstance(parsed, list) else [value]
        # fallback: coerce to string and treat as a single tracker
        return [str(value)]

    if default_metrics_collection_dict:
        # collect unique metric names across all matching trackers,
        # preserving discovery order
        found_metrics = []
        already_seen = set()
        for candidate in _tracker_names_as_list(tracker_name):
            normalized_name = normalize_flx_tracker_name(tenant_id, candidate)
            for entry in default_metrics_collection_dict.values():
                if entry.get("tracker_name") != normalized_name:
                    continue
                metric = entry.get("metric_name")
                if metric and metric not in already_seen:
                    found_metrics.append(metric)
                    already_seen.add(metric)
        if found_metrics:
            # one metric: plain string (backward compatibility),
            # several: list for UI multi-select support
            record["default_metric"] = (
                found_metrics[0] if len(found_metrics) == 1 else found_metrics
            )
            return True
    # no match: default to the status metric
    record["default_metric"] = "status"
    return False
def flx_check_dynamic_thresholds(logger, dynamic_thresholds, metrics_record):
    """
    Checks if the dynamic thresholds are breached and updates the record accordingly.

    Args:
        logger: logger instance used for debug/warning/error reporting
        dynamic_thresholds: dict of threshold records keyed by threshold id, each record
            providing metric_name, operator, condition_true, value and optionally score
        metrics_record: dict of metric_name -> value (a JSON string is accepted and parsed)

    Returns:
        - threshold_alert: 1 if one or more thresholds are in alert, 0 otherwise
        - threshold_messages: list of messages indicating which thresholds are in alert
        - threshold_scores: list of scores for breached thresholds (defaults to 100 if not specified)
    """
    # supported comparison operators, mapped to their functional equivalents
    ops = {
        ">": operator.gt,
        "<": operator.lt,
        "==": operator.eq,
        "!=": operator.ne,
        ">=": operator.ge,
        "<=": operator.le,
    }

    # metrics_record may be submitted as a JSON string, attempt to load it as a dict
    if not isinstance(metrics_record, dict):
        try:
            metrics_record = json.loads(metrics_record)
        except Exception as e:
            logger.error(
                f'metrics_record="{metrics_record}" value can not be converted to dict, exception="{e}"'
            )
            return 0, [], []

    logger.debug(
        f'starting function flx_check_dynamic_thresholds, dynamic_thresholds="{json.dumps(dynamic_thresholds, indent=2)}", metrics_record="{json.dumps(metrics_record, indent=2)}"'
    )

    threshold_alert = 0
    threshold_messages = []
    threshold_scores = []

    for threshold_key, threshold in dynamic_thresholds.items():
        logger.debug(
            f'checking threshold_key="{threshold_key}", threshold="{json.dumps(threshold, indent=2)}"'
        )
        metric_name = threshold.get("metric_name")
        op_str = threshold.get("operator")
        condition_true = strict_interpret_boolean(threshold.get("condition_true"))

        # Get threshold score, default to 100 if not present (for backward compatibility with existing records)
        try:
            threshold_score = int(threshold.get("score", 100))
        except (TypeError, ValueError):
            threshold_score = 100

        # the metric may legitimately be absent for this tracker: skip silently (debug only)
        if metric_name not in metrics_record:
            logger.debug(
                f'function flx_check_dynamic_thresholds, metric_name="{metric_name}" not found in metrics_record="{json.dumps(metrics_record, indent=2)}", skipping threshold (metric may not be available for this tracker)'
            )
            continue

        # unknown operator: the threshold record is malformed, skip it
        if op_str not in ops:
            logger.error(
                f'function flx_check_dynamic_thresholds, op_str="{op_str}" not found in ops'
            )
            continue

        # threshold value can be a string referencing a field in the metrics_record, or a proper numerical value
        threshold_num_parsed = False

        # first, try to load the threshold value as a float
        try:
            threshold_value = float(threshold.get("value"))
            threshold_num_parsed = True
        except (TypeError, ValueError):
            pass

        # if failed, try to load the threshold value from the field value referenced in the threshold value
        if not threshold_num_parsed and threshold.get("value") in metrics_record:
            try:
                threshold_value = float(metrics_record.get(threshold.get("value")))
                threshold_num_parsed = True
            except (TypeError, ValueError):
                pass

        # if both failed, log a warning message and skip the threshold
        if not threshold_num_parsed:
            logger.warning(
                f'function flx_check_dynamic_thresholds threshold_value value can not be converted to float, skipping threshold_record="{json.dumps(threshold, indent=2)}"'
            )
            continue

        try:
            metric_value = float(metrics_record.get(metric_name, 0))
        except (TypeError, ValueError):
            logger.error(
                f'function flx_check_dynamic_thresholds metric_value value can not be converted to float, skipping threshold_key="{threshold_key}"'
            )
            continue  # Skip if value can't be converted to float

        op_func = ops[op_str]
        match = op_func(metric_value, threshold_value)

        # alert if the expected condition is NOT matched:
        # condition_true=True means the comparison is expected to hold, so a non-match alerts;
        # condition_true=False means the comparison is expected NOT to hold, so a match alerts
        should_alert = (condition_true and not match) or (not condition_true and match)

        if should_alert:
            threshold_alert = 1
            threshold_messages.append(
                f"Threshold condition is in alert: "
                f"metric='{metric_name}', value={metric_value}, "
                f"threshold={threshold_value}, operator='{op_str}', "
                f"condition_true={condition_true}"
            )
            threshold_scores.append(threshold_score)

        logger.debug(
            f"function flx_check_dynamic_thresholds, Checking threshold '{threshold_key}' on metric '{metric_name}': value={metric_value}, threshold={threshold_value}, operator='{op_str}', condition_true={condition_true}, match={match}, should_alert={should_alert}"
        )

    return threshold_alert, threshold_messages, threshold_scores
def fqm_thresholds_lookup(object_value, key_value, record, thresholds_collection_dict):
    """
    Updates record with dynamic thresholds information (field dynamic_thresholds) if key_value matches the object_id value in thresholds_collection_dict records.

    Note: object_value is accepted for signature compatibility with callers but is not
    used by the lookup itself; matching relies solely on key_value against object_id.

    ex:
    {
        "c6745c4d9190e2f18bd83e4448a0584da54a832fa57dfd838b58940c8fced934": {
            "metric_name": "soar.mem_used_pct",
            "value": 80,
            "operator": ">",
            "condition_true": True,
            "mtime": 1747012850.5604594,
            "comment": "No comment for update.",
            "object_id": "199fc4f889ff4946181bb00f56aad44c7580dd87691de699e1c0d2fc851a1ec5",
            "_user": "nobody",
            "_key": "c6745c4d9190e2f18bd83e4448a0584da54a832fa57dfd838b58940c8fced934"
        }
    }
    """
    # an empty/None collection yields an empty dynamic_thresholds dict
    source = thresholds_collection_dict or {}

    # keep only the threshold records whose (truthy) object_id matches key_value
    matching_thresholds = {
        threshold_key: threshold_record
        for threshold_key, threshold_record in source.items()
        if threshold_record.get("object_id")
        and threshold_record.get("object_id") == key_value
    }

    record["dynamic_thresholds"] = matching_thresholds
    return True
def fqm_check_dynamic_thresholds(logger, dynamic_thresholds, metrics_record):
    """
    Checks if the dynamic thresholds are breached and updates the record accordingly.

    Args:
        logger: logger instance used for debug/warning/error reporting
        dynamic_thresholds: dict of threshold records keyed by threshold id, each record
            providing metric_name, operator, condition_true, value and optionally score
        metrics_record: dict of metric_name -> value (a JSON string is accepted and parsed)

    Returns:
        - threshold_alert: 1 if one or more thresholds are in alert, 0 otherwise
        - threshold_messages: list of messages indicating which thresholds are in alert
        - threshold_scores: list of scores for breached thresholds (defaults to 100 if not specified)
    """
    # supported comparison operators, mapped to their functional equivalents
    ops = {
        ">": operator.gt,
        "<": operator.lt,
        "==": operator.eq,
        "!=": operator.ne,
        ">=": operator.ge,
        "<=": operator.le,
    }

    # metrics_record may be submitted as a JSON string, attempt to load it as a dict
    if not isinstance(metrics_record, dict):
        try:
            metrics_record = json.loads(metrics_record)
        except Exception as e:
            logger.error(
                f'metrics_record="{metrics_record}" value can not be converted to dict, exception="{e}"'
            )
            return 0, [], []

    logger.debug(
        f'starting function fqm_check_dynamic_thresholds, dynamic_thresholds="{json.dumps(dynamic_thresholds, indent=2)}", metrics_record="{json.dumps(metrics_record, indent=2)}"'
    )

    threshold_alert = 0
    threshold_messages = []
    threshold_scores = []

    for threshold_key, threshold in dynamic_thresholds.items():
        logger.debug(
            f'checking threshold_key="{threshold_key}", threshold="{json.dumps(threshold, indent=2)}"'
        )
        metric_name = threshold.get("metric_name")
        op_str = threshold.get("operator")
        condition_true = strict_interpret_boolean(threshold.get("condition_true"))

        # Get threshold score, default to 100 if not present (for backward compatibility with existing records)
        try:
            threshold_score = int(threshold.get("score", 100))
        except (TypeError, ValueError):
            threshold_score = 100

        # the metric may legitimately be absent for this tracker: skip silently (debug only)
        if metric_name not in metrics_record:
            logger.debug(
                f'function fqm_check_dynamic_thresholds, metric_name="{metric_name}" not found in metrics_record="{json.dumps(metrics_record, indent=2)}", skipping threshold (metric may not be available for this tracker)'
            )
            continue

        # unknown operator: the threshold record is malformed, skip it
        if op_str not in ops:
            logger.error(
                f'function fqm_check_dynamic_thresholds, op_str="{op_str}" not found in ops'
            )
            continue

        # threshold value can be a proper numerical value, or a string referencing another
        # field in the metrics_record (consistent with flx_check_dynamic_thresholds)
        threshold_num_parsed = False

        # first, try to load the threshold value as a float
        try:
            threshold_value = float(threshold.get("value"))
            threshold_num_parsed = True
        except (TypeError, ValueError):
            pass

        # if failed, try to load the threshold value from the field value referenced in the threshold value
        if not threshold_num_parsed and threshold.get("value") in metrics_record:
            try:
                threshold_value = float(metrics_record.get(threshold.get("value")))
                threshold_num_parsed = True
            except (TypeError, ValueError):
                pass

        # if both failed, log an error message and skip the threshold
        if not threshold_num_parsed:
            logger.error(
                f'function fqm_check_dynamic_thresholds threshold_value value can not be converted to float, skipping threshold_key="{threshold_key}"'
            )
            continue

        try:
            metric_value = float(metrics_record.get(metric_name, 0))
        except (TypeError, ValueError):
            logger.error(
                f'function fqm_check_dynamic_thresholds metric_value value can not be converted to float, skipping threshold_key="{threshold_key}"'
            )
            continue  # Skip if value can't be converted to float

        op_func = ops[op_str]
        match = op_func(metric_value, threshold_value)

        # alert if the expected condition is NOT matched:
        # condition_true=True means the comparison is expected to hold, so a non-match alerts;
        # condition_true=False means the comparison is expected NOT to hold, so a match alerts
        should_alert = (condition_true and not match) or (not condition_true and match)

        if should_alert:
            threshold_alert = 1
            threshold_messages.append(
                f"Threshold condition is in alert: "
                f"metric='{metric_name}', value={metric_value}, "
                f"threshold={threshold_value}, operator='{op_str}', "
                f"condition_true={condition_true}"
            )
            threshold_scores.append(threshold_score)

        logger.debug(
            f"function fqm_check_dynamic_thresholds, Checking threshold '{threshold_key}' on metric '{metric_name}': value={metric_value}, threshold={threshold_value}, operator='{op_str}', condition_true={condition_true}, match={match}, should_alert={should_alert}"
        )

    return threshold_alert, threshold_messages, threshold_scores
def calculate_score(service, tenant_id, component):
    """
    Calculates the score for each object_id based on outlier scoring metrics from the past 24 hours.

    :param service: The Splunk service object.
    :param tenant_id: The tenant ID to query scores for.
    :param component: The component (object_category) to query scores for.

    :return: A dictionary keyed by object_id, where each value contains:
        - score: The sum of scores for the object_id (float)
        - score_outliers: The sum of scores for the object_id that are outliers (float)
        - object: The object name (string)
        - score_source: A list of scoring sources (list of strings)
    Returns an empty dictionary when a mandatory parameter is missing or the search fails.
    """
    # validate mandatory parameters; component is interpolated into the search below,
    # so it is guarded the same way as service and tenant_id
    if not service:
        logging.error('function calculate_score, service parameter is None or empty')
        return {}

    if not tenant_id:
        logging.error('function calculate_score, tenant_id parameter is None or empty')
        return {}

    if not component:
        logging.error('function calculate_score, component parameter is None or empty')
        return {}

    # Build the search query
    search_query = remove_leading_spaces(
        f"""
        | mstats sum(trackme.scoring.score) as score where `trackme_metrics_idx({tenant_id})` tenant_id="{tenant_id}" object_category="{component}" by object_id, object, score_source
        | eval score_outliers=if(match(score_source,"^(false_positive_outlier$|lowerbound_outlier|upperbound_outlier)"),score,null())
        | stats sum(score) as score, sum(score_outliers) as score_outliers, values(score_source) as score_source by object_id, object
        """
    )

    # Search parameters for past 24 hours
    kwargs_search = {
        "earliest_time": "-24h",
        "latest_time": "now",
        "preview": "false",
        "output_mode": "json",
        "count": 0,
    }

    # Initialize the result dictionary
    scores_dict = {}

    start_time = time.time()

    try:
        logging.debug(
            f'function calculate_score, tenant_id="{tenant_id}", component="{component}", search_query="{search_query}", kwargs_search="{json.dumps(kwargs_search, indent=2)}"'
        )

        # Execute the search (retried up to 24 times with a 5 seconds sleep between attempts)
        reader = run_splunk_search(
            service,
            search_query,
            kwargs_search,
            24,  # max_retries
            5,  # sleep_time
        )

        # Process results
        for item in reader:
            if isinstance(item, dict):
                object_id = item.get("object_id")
                if object_id:
                    # Get score, defaulting to 0 if not present or invalid
                    try:
                        score = float(item.get("score", 0))
                    except (TypeError, ValueError):
                        score = 0.0

                    # Get score_outliers, defaulting to 0 if not present or invalid
                    try:
                        score_outliers = float(item.get("score_outliers", 0))
                    except (TypeError, ValueError):
                        score_outliers = 0.0

                    # Get object name
                    object_name = item.get("object", "")

                    # Get score_source - it may be a string or a list depending on the number of values
                    score_source_raw = item.get("score_source")
                    if isinstance(score_source_raw, list):
                        score_source_list = score_source_raw
                    elif isinstance(score_source_raw, str):
                        score_source_list = [score_source_raw]
                    else:
                        score_source_list = []

                    # Store in dictionary
                    scores_dict[object_id] = {
                        "score": score,
                        "score_outliers": score_outliers,
                        "object": object_name,
                        "score_source": score_source_list,
                    }

        runtime = round(time.time() - start_time, 3)
        logging.debug(
            f'function calculate_score, tenant_id="{tenant_id}", '
            f'no_objects="{len(scores_dict)}", run_time="{runtime}"'
        )

    except Exception as e:
        logging.error(
            f'function calculate_score, tenant_id="{tenant_id}", '
            f'failed with exception="{str(e)}"'
        )
        # Return empty dict on error
        return {}

    return scores_dict