#!/usr/bin/env python
|
|
# coding=utf-8
|
|
|
|
__author__ = "TrackMe Limited"
|
|
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
|
|
__credits__ = "TrackMe Limited, U.K."
|
|
__license__ = "TrackMe Limited, all rights reserved"
|
|
__version__ = "0.1.0"
|
|
__maintainer__ = "TrackMe Limited, U.K."
|
|
__email__ = "support@trackme-solutions.com"
|
|
__status__ = "PRODUCTION"
|
|
|
|
# Standard library imports
|
|
import os
|
|
import sys
|
|
import time
|
|
import logging
|
|
import ast
|
|
import json
|
|
import re
|
|
import operator
|
|
|
|
# Networking and URL handling imports
|
|
from urllib.parse import urlencode
|
|
import urllib3
|
|
|
|
# Disable insecure request warnings for urllib3
# (splunkd is commonly reached over HTTPS with self-signed certificates)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# splunk home
# NOTE: raises KeyError when SPLUNK_HOME is unset — intentional, this module
# cannot operate outside of a Splunk runtime context
splunkhome = os.environ["SPLUNK_HOME"]

# append lib
# make the application's bundled libraries importable
sys.path.append(os.path.join(splunkhome, "etc", "apps", "trackme", "lib"))
|
|
|
|
# import trackme libs
|
|
from trackme_libs import run_splunk_search
|
|
|
|
# Import trackme libs logicalgroup
|
|
from trackme_libs_logicalgroup import logical_group_update_green_red_members
|
|
|
|
# Import trackme libs utils
|
|
from trackme_libs_utils import strict_interpret_boolean, remove_leading_spaces
|
|
|
|
# Import trackme libs disruption queue
|
|
from trackme_libs_disruption_queue import (
|
|
disruption_queue_lookup,
|
|
disruption_queue_update,
|
|
disruption_queue_get_duration,
|
|
)
|
|
|
|
# Import trackme libs splk flx
|
|
from trackme_libs_splk_flx import normalize_flx_tracker_name
|
|
|
|
# Import collections data for default values
|
|
from collections_data import vtenant_account_default
|
|
|
|
# logging:
|
|
# To avoid overriding logging destination of callers, the libs will not set on purpose any logging definition
|
|
# and rely on callers themselves
|
|
|
|
|
|
def get_anomaly_reason_from_component_type(component_type):
    """
    Translate a score_definition component type into its anomaly_reason value.

    Args:
        component_type: The type from score_definition.components
            (e.g., "future_tolerance_breach")

    Returns:
        The corresponding anomaly_reason value (e.g., "future_over_tolerance").
        Types without a dedicated translation (including unknown types) map
        to themselves.
    """
    # Only the component types whose anomaly_reason differs from the
    # component name need an explicit entry; everything else is identity.
    renamed = {
        "future_tolerance_breach": "future_over_tolerance",
        "delay_threshold_breach": "delay_threshold_breached",
        "lag_threshold_breach": "lag_threshold_breached",
        "latency_threshold_breach": "lag_threshold_breached",
        "min_hosts_dcount_breach": "min_hosts_dcount",
        "manual_score": "score_breached",
    }
    return renamed.get(component_type, component_type)
|
|
|
|
|
|
def get_impact_score(vtenant_account, field_name, default_value):
    """
    Resolve an impact score with fallback to the shipped defaults.

    Args:
        vtenant_account: Dictionary containing virtual tenant account configuration
        field_name: Name of the impact score field to retrieve
        default_value: Default value to use if field is not found

    Returns:
        Integer impact score value (or default_value as-is when even the
        shipped default cannot be converted to an integer).
    """
    # Tenant-level configuration wins when present and numeric
    if isinstance(vtenant_account, dict) and vtenant_account:
        configured = vtenant_account.get(field_name)
        if configured is not None:
            try:
                return int(configured)
            except (ValueError, TypeError):
                pass

    # Otherwise use the shipped default for this field, or the caller default
    fallback = vtenant_account_default.get(field_name, default_value)
    try:
        return int(fallback)
    except (ValueError, TypeError):
        return default_value
|
|
|
|
|
|
def get_entity_impact_score(record, component, score_type, vtenant_account, default_value):
    """
    Resolve an impact score, honouring entity-level overrides first.

    Entity-level impact_score_weights (JSON string or dict carried by the
    record) takes precedence; otherwise the tenant-level configuration
    field impact_score_{component}_{score_type}_threshold_breach is used.

    Args:
        record: Dictionary containing entity record data (may contain impact_score_weights)
        component: Component type ('dsm' or 'dhm')
        score_type: Score type ('delay' or 'latency')
        vtenant_account: Dictionary containing virtual tenant account configuration
        default_value: Default value to use if no override is found

    Returns:
        Integer impact score value
    """
    weights = None
    if isinstance(record, dict) and record:
        raw_weights = record.get("impact_score_weights")
        if raw_weights:
            try:
                # Accept both serialized JSON and an already-parsed dict
                if isinstance(raw_weights, str):
                    weights = json.loads(raw_weights)
                elif isinstance(raw_weights, dict):
                    weights = raw_weights
            except (json.JSONDecodeError, AttributeError, TypeError):
                # Unparseable override: fall through to tenant-level config
                weights = None

    if isinstance(weights, dict):
        override = weights.get(score_type)
        if override is not None:
            try:
                return int(override)
            except (ValueError, TypeError):
                pass

    # Fall back to tenant-level configuration
    field_name = f"impact_score_{component}_{score_type}_threshold_breach"
    return get_impact_score(vtenant_account, field_name, default_value)
|
|
|
|
|
|
def parse_filters(query_parameters):
    """
    Parse flat query parameters of the form filter[N][prop] — and
    filter[N][value][M] for list values — into a list of filter dicts.

    String values are lowercased; non-string values are kept as-is.
    """
    parsed = []
    for param_key, param_value in query_parameters.items():
        if "filter[" not in param_key:
            continue

        segments = param_key.split("[")
        filter_index = int(segments[1].split("]")[0])
        prop_name = segments[2].split("]")[0]

        # Grow the result list until the target slot exists
        while len(parsed) <= filter_index:
            parsed.append({})

        normalized = param_value.lower() if isinstance(param_value, str) else param_value

        if "value" in param_key and len(segments) > 3:
            # List-style value: filter[N][value][M] — place at position M,
            # padding with None so out-of-order keys land correctly
            values = parsed[filter_index].setdefault("value", [])
            position = int(segments[3].split("]")[0])
            while len(values) <= position:
                values.append(None)
            values[position] = normalized
        else:
            # Scalar property (field, type, or a scalar value)
            parsed[filter_index][prop_name] = normalized

    return parsed
|
|
|
|
|
|
def record_matches_filter(record, filter):
    """
    Evaluate a single structured filter against a record.

    Args:
        record: Dictionary of record fields.
        filter: Dictionary with keys "field", "type" and "value", as
            produced by parse_filters.

    Returns:
        True if the record satisfies the filter, False otherwise
        (unknown filter types never match).

    Notes:
        - An empty filter value always matches.
        - String comparisons are case-insensitive (values are lowercased).
        - A pipe-delimited record value (except for the "alias" and
          "object" fields) is treated as a pseudo list.
    """
    field = filter.get("field")
    filter_type = filter.get("type")
    value = filter.get("value")

    # Immediately return True if the filter value is empty
    if value == "":
        return True

    # Try to interpret the value as a JSON list if it looks like one
    if isinstance(value, str) and value.startswith("[") and value.endswith("]"):
        try:
            value = json.loads(value)
        except json.JSONDecodeError:
            pass  # If decoding fails, proceed with the original value string

    # Prepare the record value for comparison
    record_value = record.get(field, "")
    if (
        isinstance(record_value, str)
        and "|" in record_value
        and field not in ["alias", "object"]
    ):
        # Treat as a pseudo list if record_value contains pipes
        record_value = [item.strip().lower() for item in record_value.split("|")]
    elif isinstance(record_value, str):
        record_value = record_value.strip().lower()

    if isinstance(value, str):
        value = value.strip().lower()

    # Safe numerical comparison dispatch (replaces the previous eval() call)
    numeric_ops = {
        "<": operator.lt,
        "<=": operator.le,
        ">": operator.gt,
        ">=": operator.ge,
    }

    # Handling for different filter types when record_value is a list
    if isinstance(record_value, list):
        if filter_type == "like":
            if isinstance(value, list):
                return any(v.lower() in item for item in record_value for v in value)
            else:
                return any(value in item for item in record_value)
        elif filter_type == "=":
            if isinstance(value, list):
                return any(item == v.lower() for item in record_value for v in value)
            else:
                return value in record_value
        # we can accept != as a filter
        elif filter_type == "!=":
            if isinstance(value, list):
                return any(item != v.lower() for item in record_value for v in value)
            else:
                return value not in record_value
        elif filter_type in ("<", "<=", ">", ">="):
            # Numerical comparisons are not defined for multi-value records
            return False
        elif filter_type == "in":
            if isinstance(value, list):
                return any(item in [v.lower() for v in value] for item in record_value)
            else:
                return value in record_value
        elif filter_type == "starts":
            return any(item.startswith(value) for item in record_value)
        elif filter_type == "ends":
            return any(item.endswith(value) for item in record_value)
        elif filter_type == "regex":
            return any(re.search(value, item) is not None for item in record_value)
    else:
        # Handling for different filter types when record_value is a string
        if filter_type == "like":
            return value in record_value
        elif filter_type == "=":
            return record_value == value
        elif filter_type == "!=":
            # Attempt numerical comparison first, fallback to string comparison
            try:
                return float(record_value) != float(value)
            except (ValueError, TypeError):
                return record_value != value
        elif filter_type in numeric_ops:
            try:
                left = float(record_value)
                right = float(value)
            except (ValueError, TypeError):
                return False  # Skip filter if conversion fails
            # Dispatch via the operator module: no eval() on request input
            return numeric_ops[filter_type](left, right)
        elif filter_type == "in":
            if isinstance(value, list):
                return record_value in [v.lower() for v in value]
            else:
                return record_value == value
        elif filter_type == "starts":
            return record_value.startswith(value)
        elif filter_type == "ends":
            return record_value.endswith(value)
        elif filter_type == "regex":
            return re.search(value, record_value) is not None

    return False
|
|
|
|
|
|
def pre_filter_records(data_records, query_parameters):
    """
    Apply an early filtering pass restricted to the fields 'alias',
    'object' and 'monitored_state'.

    When any parsed filter targets a field outside of that scope, the
    pre-filter is skipped entirely and all records are returned untouched
    (the full filtering pass handles them later).
    """
    parsed_filters = parse_filters(query_parameters)
    allowed_fields = {"alias", "object", "monitored_state"}

    for candidate in parsed_filters:
        if candidate.get("field") not in allowed_fields:
            # Out-of-scope filter present: defer to the full filtering pass
            return data_records

    # All filters are within the pre-filter scope: apply them with AND logic
    return [
        rec
        for rec in data_records
        if all(record_matches_filter(rec, flt) for flt in parsed_filters)
    ]
|
|
|
|
|
|
def filter_records(data_records, query_parameters):
    """
    Apply all structured filters parsed from the query parameters.

    A record is kept only when it matches every filter (AND logic).
    """
    parsed_filters = parse_filters(query_parameters)

    if parsed_filters:
        logging.debug(f'filters="{parsed_filters}"')

    kept = []
    for rec in data_records:
        # all() over an empty filter list keeps every record
        if all(record_matches_filter(rec, flt) for flt in parsed_filters):
            kept.append(rec)
    return kept
|
|
|
|
|
|
def convert_seconds_to_duration(seconds):
    """
    Convert a number of seconds into a "[D+]HH:MM:SS" duration string.

    The optional leading "D+" segment carries whole days; hours, minutes
    and seconds are zero-padded to two digits. Negative inputs yield the
    same duration prefixed with "-".

    Args:
        seconds: Number of seconds (int, float, or an int-parseable string).

    Returns:
        The duration string, or the integer 0 when the input cannot be
        interpreted as a number (kept for backward compatibility).
    """
    try:
        original_seconds = int(seconds)
    except (ValueError, TypeError):
        # Bug fix: also catch TypeError so that None (or other non-numeric
        # objects) returns 0 instead of raising
        return 0

    # Check if the original seconds were negative
    is_negative = original_seconds < 0
    seconds = abs(original_seconds)

    # Calculate days, hours, minutes, and seconds
    days = seconds // (24 * 3600)
    seconds = seconds % (24 * 3600)
    hours = seconds // 3600
    seconds %= 3600
    minutes = seconds // 60
    seconds %= 60

    # Format the duration string, only showing the days segment when needed
    if days > 0:
        duration = f"{days}+{hours:02d}:{minutes:02d}:{seconds:02d}"
    else:
        duration = f"{hours:02d}:{minutes:02d}:{seconds:02d}"

    # Add "-" if the original seconds were negative
    if is_negative:
        duration = "-" + duration

    return duration
|
|
|
|
|
|
def convert_epoch_to_datetime(epoch):
|
|
"""
|
|
Define the function convert_epoch_to_datetime
|
|
"""
|
|
|
|
# convert epoch to float
|
|
try:
|
|
epoch = float(epoch)
|
|
# convert epoch to datetime
|
|
datetime = time.strftime("%d %b %Y %H:%M", time.localtime(epoch))
|
|
return datetime
|
|
except Exception as e:
|
|
epoch = 0
|
|
return epoch
|
|
|
|
|
|
def get_monitoring_time_status(monitoring_time_policy, monitoring_time_rules):
    """
    Determine if an entity is currently under monitoring based on monitoring_time_policy and monitoring_time_rules.

    Arguments:
    - monitoring_time_policy: predefined policy name (string/list) or dictionary format
    - monitoring_time_rules: dictionary with week day keys (0-6) and hour lists as values

    Returns:
    - (isUnderMonitoring, anomaly_reason, status_message) tuple
    - isUnderMonitoring: True if entity is currently under monitoring, False otherwise
    - anomaly_reason: "out_of_monitoring_times" if not under monitoring, None otherwise
    - status_message: Human-readable message describing the monitoring status

    Precedence: monitoring_time_rules > monitoring_time_policy > all_time fallback.
    Day numbering follows time.strftime("%w"): 0=Sunday .. 6=Saturday.
    The check is based on the current local wall-clock time.
    """

    try:
        import json  # NOTE(review): shadows the module-level json import; redundant but harmless

        # Helper function to convert day number to day name
        def get_day_name(day_no):
            day_names = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]
            return day_names[day_no] if 0 <= day_no <= 6 else f"Day {day_no}"

        # Helper function to format time in human-readable format (HH:MM)
        def format_time(hour_decimal):
            hour_int = int(hour_decimal)
            minutes = int((hour_decimal - hour_int) * 60)
            if minutes == 0:
                return f"{hour_int:02d}:00"
            else:
                return f"{hour_int:02d}:{minutes:02d}"

        # get current wday (0=Sunday, 6=Saturday)
        current_wday_no = int(time.strftime("%w"))
        current_day_name = get_day_name(current_wday_no)

        # get current hour and minute for precise checking
        # (expressed as a decimal hour, e.g. 14.5 for 14:30)
        current_hour = int(time.strftime("%H"))
        current_minute = int(time.strftime("%M"))
        current_hour_decimal = current_hour + (current_minute / 60.0)
        current_time_str = format_time(current_hour_decimal)

        # Priority: monitoring_time_rules > monitoring_time_policy > all_time

        # Check monitoring_time_rules first (takes precedence)
        if monitoring_time_rules is not None and monitoring_time_rules != "":
            try:
                # Parse if it's a string
                if isinstance(monitoring_time_rules, str):
                    rules_dict = json.loads(monitoring_time_rules)
                else:
                    rules_dict = monitoring_time_rules

                if isinstance(rules_dict, dict) and len(rules_dict) > 0:
                    # Check if current day is in the rules
                    day_key = str(current_wday_no)
                    if day_key in rules_dict:
                        hours_list = rules_dict[day_key]
                        if isinstance(hours_list, list) and len(hours_list) > 0:
                            # Check if current hour is in the list
                            for hour_val in hours_list:
                                try:
                                    hour_float = float(hour_val)
                                    # Check if current hour matches (within the hour range)
                                    # each listed value covers [hour, hour+1)
                                    if hour_float <= current_hour_decimal < hour_float + 1:
                                        return (
                                            True,
                                            None,
                                            f"This entity is currently under monitoring (custom rules: {current_day_name} {current_time_str})"
                                        )
                                except (ValueError, TypeError):
                                    continue

                    # Current day/hour not in rules
                    return (
                        False,
                        "out_of_monitoring_times",
                        f"This entity is not currently under monitoring (custom rules: {current_day_name} {current_time_str} is not within the configured monitoring schedule)"
                    )
            except Exception as e:
                # parsing failure is non fatal: fall back to the policy check below
                logging.warning(f"Failed to parse monitoring_time_rules: {str(e)}, falling back to policy")

        # Check monitoring_time_policy
        if monitoring_time_policy is not None and monitoring_time_policy != "":
            try:
                # Parse if it's a string
                if isinstance(monitoring_time_policy, str):
                    # Try to parse as JSON first (might be dictionary)
                    try:
                        policy_dict = json.loads(monitoring_time_policy)
                    except (json.JSONDecodeError, ValueError):
                        # Not JSON, treat as predefined policy name
                        policy_dict = None
                        policy_name = monitoring_time_policy
                elif isinstance(monitoring_time_policy, list):
                    # List of policy names - use first one
                    policy_name = monitoring_time_policy[0] if len(monitoring_time_policy) > 0 else None
                    policy_dict = None
                elif isinstance(monitoring_time_policy, dict):
                    policy_dict = monitoring_time_policy
                    policy_name = None
                else:
                    policy_dict = None
                    policy_name = None

                # If dictionary format, use it like monitoring_time_rules
                if policy_dict is not None and isinstance(policy_dict, dict) and len(policy_dict) > 0:
                    day_key = str(current_wday_no)
                    if day_key in policy_dict:
                        hours_list = policy_dict[day_key]
                        if isinstance(hours_list, list) and len(hours_list) > 0:
                            for hour_val in hours_list:
                                try:
                                    hour_float = float(hour_val)
                                    if hour_float <= current_hour_decimal < hour_float + 1:
                                        return (
                                            True,
                                            None,
                                            f"This entity is currently under monitoring (custom policy: {current_day_name} {current_time_str})"
                                        )
                                except (ValueError, TypeError):
                                    continue
                    return (
                        False,
                        "out_of_monitoring_times",
                        f"This entity is not currently under monitoring (custom policy: {current_day_name} {current_time_str} is not within the configured monitoring schedule)"
                    )

                # Map predefined policy names to day+hour rules
                if policy_name:
                    if policy_name == "all_time":
                        return (True, None, "This entity is currently under monitoring (all_time policy)")

                    elif policy_name == "business_days_all_hours":
                        # Monday-Friday (1-5), all hours
                        if current_wday_no in [1, 2, 3, 4, 5]:
                            return (True, None, f"This entity is currently under monitoring (business_days_all_hours policy: {current_day_name})")
                        else:
                            return (
                                False,
                                "out_of_monitoring_times",
                                f"This entity is not currently under monitoring (business_days_all_hours policy: {current_day_name} is not a business day)"
                            )

                    elif policy_name == "monday_saturday_all_hours":
                        # Monday-Saturday (1-6), all hours
                        if current_wday_no in [1, 2, 3, 4, 5, 6]:
                            return (True, None, f"This entity is currently under monitoring (monday_saturday_all_hours policy: {current_day_name})")
                        else:
                            return (
                                False,
                                "out_of_monitoring_times",
                                f"This entity is not currently under monitoring (monday_saturday_all_hours policy: {current_day_name} is not within Monday-Saturday)"
                            )

                    elif policy_name == "business_days_08h_20h":
                        # Monday-Friday (1-5), 8:00-20:00
                        if current_wday_no in [1, 2, 3, 4, 5]:
                            if 8 <= current_hour < 20:
                                return (True, None, f"This entity is currently under monitoring (business_days_08h_20h policy: {current_day_name} {current_time_str})")
                            else:
                                return (
                                    False,
                                    "out_of_monitoring_times",
                                    f"This entity is not currently under monitoring (business_days_08h_20h policy: {current_day_name} {current_time_str} is outside 08:00-20:00 range)"
                                )
                        else:
                            return (
                                False,
                                "out_of_monitoring_times",
                                f"This entity is not currently under monitoring (business_days_08h_20h policy: {current_day_name} is not a business day)"
                            )

                    elif policy_name == "monday_saturday_08h_20h":
                        # Monday-Saturday (1-6), 8:00-20:00
                        if current_wday_no in [1, 2, 3, 4, 5, 6]:
                            if 8 <= current_hour < 20:
                                return (True, None, f"This entity is currently under monitoring (monday_saturday_08h_20h policy: {current_day_name} {current_time_str})")
                            else:
                                return (
                                    False,
                                    "out_of_monitoring_times",
                                    f"This entity is not currently under monitoring (monday_saturday_08h_20h policy: {current_day_name} {current_time_str} is outside 08:00-20:00 range)"
                                )
                        else:
                            return (
                                False,
                                "out_of_monitoring_times",
                                f"This entity is not currently under monitoring (monday_saturday_08h_20h policy: {current_day_name} is not within Monday-Saturday)"
                            )
            except Exception as e:
                # policy parsing failure is non fatal: default to always-on monitoring
                logging.warning(f"Failed to parse monitoring_time_policy: {str(e)}, falling back to all_time")

        # Final fallback: all_time (monitor always)
        # NOTE(review): an unrecognized policy_name also falls through to here
        return (True, None, "This entity is currently under monitoring (all_time policy)")

    except Exception as e:
        logging.error(f"get_monitoring_time_status function has failed, exception={str(e)}")
        # Fallback to all_time on error
        return (True, None, f"Monitoring time status check failed: {str(e)}, defaulting to all_time monitoring")
|
|
|
|
|
|
def get_outliers_status(isOutlier, OutliersDisabled, tenant_outliers_set_state=None, score_outliers=None):
    """
    Normalize the outliers flag.

    Returns:
        0 = not an outlier (or outliers disabled),
        1 = outlier allowed to turn the entity red,
        2 = outlier reported but not allowed to affect the entity status
            (score-based approach with score_outliers below 100).

    tenant_outliers_set_state is deprecated and ignored, kept only for
    backward compatibility. When score_outliers is None (legacy callers),
    an active outlier always returns 1.
    """
    # Feature disabled, or no outlier detected in the first place
    if OutliersDisabled == 1 or isOutlier != 1:
        return 0

    # Legacy behavior: without a score, the outlier may turn the entity red
    if score_outliers is None:
        return 1

    # Score-based approach: only a score of 100+ can turn the entity red
    return 1 if score_outliers >= 100 else 2
|
|
|
|
|
|
def get_data_sampling_status(
    data_sample_status_colour, data_sample_feature, tenant_data_sampling_set_state=None
):
    """
    Normalize the data sampling anomaly flag.

    Args:
        data_sample_status_colour: "green", "red" or any other colour string
        data_sample_feature: "disabled" to turn the feature off at the
            entity level, anything else means enabled
        tenant_data_sampling_set_state: Deprecated - no longer used with the
            score-based approach. Kept for backward compatibility.

    Returns:
        0 = no anomaly, 1 = anomaly. (With the score-based approach, the
        impact score decides whether the anomaly turns the entity red.)
    """
    # If disabled at the entity level, never report an anomaly.
    # Bug fix: previously this assignment was unconditionally overwritten
    # by the colour checks below, so a disabled entity with a red sampling
    # status still reported an anomaly.
    if data_sample_feature == "disabled":
        return 0

    if data_sample_status_colour == "red":
        # With score-based approach, the score controls the impact, so always
        # surface the anomaly; the score decides if the entity turns red
        return 1

    # green or any other/unknown colour: no anomaly
    return 0
|
|
|
|
|
|
def get_future_status(
    future_tolerance,
    system_future_tolerance,
    data_last_lag_seen,
    data_last_ingestion_lag_seen,
    data_last_time_seen,
    data_last_ingest,
):
    """
    Detect events indexed in the future.

    Args:
        future_tolerance: entity-level tolerance in seconds; 0 means
            "inherit the system-level tolerance"
        system_future_tolerance: system-level tolerance in seconds
        data_last_lag_seen: latest event delay in seconds (negative when
            events are timestamped in the future)
        data_last_ingestion_lag_seen: latest ingestion latency in seconds
        data_last_time_seen: epoch of the latest event (_time)
        data_last_ingest: epoch of the latest ingestion

    Returns:
        (isFuture, isFutureMsg, future_tolerance): isFuture is True when
        either lag metric falls below the effective tolerance, isFutureMsg
        carries the explanation (empty string otherwise), and
        future_tolerance is the effective (possibly inherited) tolerance.
    """

    isFuture = False
    isFutureMsg = ""
    # An entity-level tolerance of 0 means: inherit the system-level setting
    if future_tolerance == 0:
        future_tolerance = system_future_tolerance

    # convert all to int, tolerating numeric strings
    try:
        future_tolerance = int(round(float(future_tolerance), 0))
        data_last_lag_seen = int(round(float(data_last_lag_seen), 0))
        data_last_ingestion_lag_seen = int(
            round(float(data_last_ingestion_lag_seen), 0)
        )
    except (ValueError, TypeError):
        # Bug fix: was a bare except. Best effort conversion: keep the
        # original values, the float() comparisons below will surface
        # truly invalid input
        pass

    logging.debug(
        f"data_last_lag_seen={data_last_lag_seen}, system_future_tolerance={system_future_tolerance}, future_tolerance={future_tolerance}"
    )

    if float(data_last_lag_seen) < float(future_tolerance) or float(
        data_last_ingestion_lag_seen
    ) < float(future_tolerance):
        isFuture = True

        # convert data_last_lag_seen to duration
        data_last_lag_seen_duration = convert_seconds_to_duration(data_last_lag_seen)

        # convert data_last_ingestion_lag_seen to duration
        data_last_ingestion_lag_seen_duration = convert_seconds_to_duration(
            data_last_ingestion_lag_seen
        )

        # convert data_last_time_seen to %c
        data_last_time_seen_datetime = convert_epoch_to_datetime(data_last_time_seen)

        # convert data_last_ingest to %c
        data_last_ingest_datetime = convert_epoch_to_datetime(data_last_ingest)

        isFutureMsg = f"""detected data indexed in the future which is most likely due to timestamping misconfiguration, timezone or time synchronization issue. Event delay is {data_last_lag_seen} seconds (duration: {data_last_lag_seen_duration}), Event latency is {data_last_ingestion_lag_seen} seconds (duration: {data_last_ingestion_lag_seen_duration}), this is beyond current tolerance threshold of {future_tolerance} seconds, latest event available (_time) for this entity: {data_last_time_seen_datetime}, latest event ingested for this entity: {data_last_ingest_datetime}. Review and fix the root cause, or adapt the future tolerance at the system level or for this entity especially."""

    else:
        isFuture = False

    return isFuture, isFutureMsg, future_tolerance
|
|
|
|
|
|
def get_future_metrics_status(
    system_future_tolerance,
    metric_last_time_seen,
):
    """
    Detect metrics indexed in the future.

    Returns (isFuture, isFutureMsg): isFuture is True when
    metric_last_time_seen is below system_future_tolerance, in which case
    isFutureMsg carries a human-readable explanation (empty otherwise).
    """

    logging.debug(
        f"metric_last_time_seen={metric_last_time_seen}, system_future_tolerance={system_future_tolerance}"
    )

    # Within tolerance: nothing to report
    if float(metric_last_time_seen) >= float(system_future_tolerance):
        return False, ""

    # Build human readable context for the anomaly message
    duration_repr = convert_seconds_to_duration(metric_last_time_seen)
    datetime_repr = convert_epoch_to_datetime(metric_last_time_seen)

    message = f"""detected metrics indexed in the future which is most likely due to timestamping misconfiguration, timezone or time synchronization issue. Metric delay is {metric_last_time_seen} seconds (duration: {duration_repr}) which is beyond tolerance threshold of {system_future_tolerance}, latest event available (_time) for this entity: {datetime_repr}. Review and fix the root cause, or adapt the future tolerance at the system level or for this entity especially"""

    return True, message
|
|
|
|
|
|
def get_is_under_dcount_host(min_dcount_host, min_dcount_threshold, min_dcount_field):
    """
    Check whether the entity runs under the minimal distinct count of hosts.

    Args:
        min_dcount_host: Minimal number of hosts required (numeric when the
            feature is enforced).
        min_dcount_threshold: Current number of hosts observed through the
            metric named by min_dcount_field.
        min_dcount_field: Name of the metric used to count hosts.

    Returns:
        (isUnderDcountHost, isUnderDcountHostMsg): True plus an explanation
        message when the observed count is below the required minimum,
        (False, "") otherwise (including non-numeric minimums).
    """

    # Bug fix: accept int as well as float for the configured minimum — the
    # previous isinstance(min_dcount_host, float) silently disabled the check
    # for integer minimums, although the documented behaviour is "True if
    # ... is a numerical". bool is excluded (it is a subclass of int).
    if (
        isinstance(min_dcount_host, (int, float))
        and not isinstance(min_dcount_host, bool)
        and min_dcount_threshold < min_dcount_host
    ):
        isUnderDcountHost = True
        isUnderDcountHostMsg = f"""Monitoring conditions are not met due to low number of hosts. Number of hosts is {int(min_dcount_threshold)} based on the metric {min_dcount_field} which is lower than the minimum required number of hosts of {int(min_dcount_host)}"""
    else:
        isUnderDcountHost = False
        isUnderDcountHostMsg = ""

    return isUnderDcountHost, isUnderDcountHostMsg
|
|
|
|
|
|
def get_logical_groups_collection_records(collection):
    """
    Fetch every record from a logical groups KVstore collection, paginating
    through batches, and build the lookup structures used by callers.

    :param collection: The collection object to query (exposes data.query).
    :return: Tuple of (collection_records, collection_records_dict,
             collection_members_list, collection_members_dict,
             count_to_process_list).
    """

    records = []
    records_by_key = {}
    keys_to_process = []
    members = []
    members_info = {}

    offset = 0
    while True:
        batch = collection.data.query(skip=offset)
        if not batch:
            # No more records: pagination is complete
            break

        for entry in batch:
            entry_key = entry.get("_key")
            records.append(entry)
            records_by_key[entry_key] = {
                "object_group_name": entry.get("object_group_name"),
                "object_group_mtime": entry.get("object_group_mtime"),
                "object_group_members": entry.get("object_group_members", []),
                "object_group_members_green": entry.get(
                    "object_group_members_green", []
                ),
                "object_group_members_red": entry.get(
                    "object_group_members_red", []
                ),
                "object_group_min_green_percent": entry.get(
                    "object_group_min_green_percent", 0
                ),
            }

            try:
                # Register each member once, remembering its owning group
                for member in entry.get("object_group_members", []):
                    if member in members:
                        continue
                    members.append(member)
                    members_info[member] = {
                        "object_group_key": entry_key,
                        "object_group_name": entry.get("object_group_name"),
                        "object_group_members": entry.get(
                            "object_group_members", []
                        ),
                        "object_group_members_green": entry.get(
                            "object_group_members_green", []
                        ),
                        "object_group_members_red": entry.get(
                            "object_group_members_red", []
                        ),
                        "object_group_min_green_percent": entry.get(
                            "object_group_min_green_percent", 0
                        ),
                    }

            except Exception as e:
                logging.error(
                    f"function get_logical_groups_collection_records, error while processing logical group members, exception={str(e)}"
                )

            keys_to_process.append(entry_key)

        # KVstore pages are requested in batches of 5000 records
        offset += 5000

    return (
        records,
        records_by_key,
        members,
        members_info,
        keys_to_process,
    )
|
|
|
|
|
|
def get_and_manage_logical_group_status(
    splunkd_uri,
    session_key,
    tenant_id,
    object_name,
    object_state,
    object_group_key,
    object_logical_group_dict,
):
    """
    Compute the Logical Group status for an entity and maintain the group's
    green/red membership lists in the KVstore when the entity state changed.

    :param splunkd_uri: Splunkd URI, forwarded to the members update API call
    :param session_key: Splunk session key, forwarded to the members update API call
    :param tenant_id: the tenant identifier
    :param object_name: the entity name
    :param object_state: the entity current state, "green" is handled
        specifically, any other value is treated as a non-green state
    :param object_group_key: the KVstore key of the logical group record
    :param object_logical_group_dict: dict describing the logical group, with
        keys object_group_name, object_group_members, object_group_members_green,
        object_group_members_red, object_group_min_green_percent

    :return: tuple (isUnderLogicalGroup, LogicalGroupStateInAlert, LogicalGroupMsg)
        isUnderLogicalGroup (bool): True if object_group_key is not empty and
            the group contains more than one member
        LogicalGroupStateInAlert (bool): True if the group green percentage is
            lower than the minimal green percentage configured for the group
        LogicalGroupMsg (str): human readable description of the group status,
            empty string when the entity is not under a logical group
    """

    # defaults, also used when the entity is not part of a logical group, so
    # that the status calculation and return always operate on defined values
    object_group_members_count = 0
    object_group_min_green_percent = 0
    object_group_members_green = []
    object_group_members_red = []

    # get logical group name
    object_group_name = object_logical_group_dict.get("object_group_name", "")

    try:
        # enter if the group is not empty
        if object_group_name != "":
            object_group_min_green_percent = object_logical_group_dict.get(
                "object_group_min_green_percent", 0
            )

            object_group_members = object_logical_group_dict.get(
                "object_group_members", []
            )
            # normalize to a list BEFORE counting: a single member stored as a
            # string would otherwise be counted character by character by len()
            if isinstance(object_group_members, str):
                object_group_members = [object_group_members]
            try:
                object_group_members_count = len(object_group_members)
            except TypeError:
                object_group_members_count = 0

            object_group_members_green = object_logical_group_dict.get(
                "object_group_members_green", []
            )
            # if not a list and is a string, convert to list
            if isinstance(object_group_members_green, str):
                object_group_members_green = [object_group_members_green]

            object_group_members_red = object_logical_group_dict.get(
                "object_group_members_red", []
            )
            # if not a list and is a string, convert to list
            if isinstance(object_group_members_red, str):
                object_group_members_red = [object_group_members_red]

            # if object_state is green, object_name must be in object_group_members_green but not in object_group_members_red
            # if object_state is red or blue, object_name must be in object_group_members_red but not in object_group_members_green
            # any change to these lists implies an update of the KVstore record
            update_kvstore_record = False

            if object_state == "green":
                if object_name not in object_group_members_green:
                    object_group_members_green.append(object_name)
                    update_kvstore_record = True
                if object_name in object_group_members_red:
                    object_group_members_red.remove(object_name)
                    update_kvstore_record = True
            else:
                if object_name not in object_group_members_red:
                    object_group_members_red.append(object_name)
                    update_kvstore_record = True
                if object_name in object_group_members_green:
                    object_group_members_green.remove(object_name)
                    update_kvstore_record = True

            # if update_kvstore_record is True, call the API endpoint accordingly
            if update_kvstore_record:
                try:
                    response = logical_group_update_green_red_members(
                        splunkd_uri,
                        session_key,
                        tenant_id,
                        object_name,
                        object_group_key,
                        object_group_members_green,
                        object_group_members_red,
                    )
                    logging.info(
                        f'tenant="{tenant_id}", object="{object_name}", logical group green/red members update API was successfull, response="{response}"'
                    )
                except Exception as e:
                    # best effort: a failed membership update must not prevent
                    # the status calculation from completing
                    logging.error(
                        f'tenant="{tenant_id}", object="{object_name}", logical group green/red members update API call has failed, exception="{str(e)}"'
                    )

        # ensure object_group_min_green_percent is float
        try:
            object_group_min_green_percent = float(object_group_min_green_percent)
        except (TypeError, ValueError):
            object_group_min_green_percent = 0

        # calculate object_group_green_percent, if logical group is empty, then object_group_green_percent is 100
        try:
            if object_group_members_count > 0:
                object_group_green_percent = (
                    len(object_group_members_green) / object_group_members_count
                ) * 100
            else:
                object_group_green_percent = 100
        except (TypeError, ZeroDivisionError):
            object_group_green_percent = 0

        # define status and return
        if object_group_key != "" and object_group_members_count > 1:
            isUnderLogicalGroup = True
            if object_group_green_percent < object_group_min_green_percent:
                LogicalGroupStateInAlert = True
                LogicalGroupMsg = f"""Logical Group {object_group_name} with key="{object_group_key}" is in alert state. The current green percentage of the group is {round(object_group_green_percent, 2)}% which is lower than the minimum green percentage of {round(object_group_min_green_percent, 2)}%, object_group_members_count={object_group_members_count}, object_group_members_red={object_group_members_red}"""
            else:
                LogicalGroupStateInAlert = False
                LogicalGroupMsg = f"""Logical Group {object_group_name} with key="{object_group_key}" is in normal state. The current green percentage of the group is {round(object_group_green_percent, 2)}% which is higher or equal to the minimal green percentage of {round(object_group_min_green_percent, 2)}%, object_group_members_count={object_group_members_count}, object_group_members_red={object_group_members_red}"""
        else:
            isUnderLogicalGroup = False
            LogicalGroupStateInAlert = False
            LogicalGroupMsg = ""

        return isUnderLogicalGroup, LogicalGroupStateInAlert, LogicalGroupMsg

    except Exception as e:
        logging.error(
            f'function get_and_manage_logical_group_status has failed, exception="{str(e)}", object_name="{object_name}", object_group_key="{object_group_key}", object_logical_group_dict="{object_logical_group_dict}"'
        )
        return (
            False,
            False,
            f'function get_and_manage_logical_group_status has failed, exception="{str(e)}", object_name="{object_name}", object_group_key="{object_group_key}"',
        )
|
|
|
|
|
|
def get_dsm_latency_status(
    data_last_ingestion_lag_seen,
    data_max_lag_allowed,
    data_last_ingest,
    data_last_time_seen,
):
    """
    Evaluate the ingestion latency status of a DSM entity.

    :param data_last_ingestion_lag_seen: last observed ingestion latency in seconds
    :param data_max_lag_allowed: maximum allowed ingestion latency in seconds
    :param data_last_ingest: epoch time of the latest indexed event (_indextime)
    :param data_last_time_seen: epoch time of the latest event (_time)

    :return: tuple (isUnderLatencyAlert, isUnderLatencyMessage)
        isUnderLatencyAlert (bool): True if data_last_ingestion_lag_seen is
            higher than data_max_lag_allowed
        isUnderLatencyMessage (str): human readable description of the latency
            status, including duration and datetime renderings of the inputs
    """

    # convert inputs to float, fall back to 0 on missing or invalid values
    try:
        data_last_ingestion_lag_seen = float(data_last_ingestion_lag_seen)
    except (TypeError, ValueError):
        data_last_ingestion_lag_seen = 0

    try:
        data_max_lag_allowed = float(data_max_lag_allowed)
    except (TypeError, ValueError):
        data_max_lag_allowed = 0

    try:
        data_last_ingest = float(data_last_ingest)
    except (TypeError, ValueError):
        data_last_ingest = 0

    try:
        data_last_time_seen = float(data_last_time_seen)
    except (TypeError, ValueError):
        data_last_time_seen = 0

    # human readable renderings used in the status messages
    data_last_ingestion_lag_seen_duration = convert_seconds_to_duration(
        data_last_ingestion_lag_seen
    )
    data_max_lag_allowed_duration = convert_seconds_to_duration(data_max_lag_allowed)
    data_last_ingest_datetime = convert_epoch_to_datetime(data_last_ingest)
    data_last_time_seen_datetime = convert_epoch_to_datetime(data_last_time_seen)

    # elapsed time since the latest indexed event and since the latest event
    # timestamp, used to qualify the alert message
    time_since_last_ingestion = time.time() - data_last_ingest
    time_since_last_event = time.time() - data_last_time_seen

    # define isUnderLatencyAlert
    isUnderLatencyAlert = data_last_ingestion_lag_seen > data_max_lag_allowed

    # define isUnderLatencyMessage
    if isUnderLatencyAlert:
        # if both the time since last ingestion and the time since the last
        # event are below the threshold, the source is likely receiving a mix
        # of delayed and non-delayed events
        if (
            time_since_last_ingestion < data_max_lag_allowed
            and time_since_last_event < data_max_lag_allowed
        ):
            isUnderLatencyMessage = f"""Monitoring conditions are not met due to latency issues. Ingestion latency is approximately {round(float(data_last_ingestion_lag_seen), 3)} seconds (duration: {data_last_ingestion_lag_seen_duration}), which is higher than the maximum allowed latency of {int(data_max_lag_allowed)} seconds (duration: {data_max_lag_allowed_duration}), latest event available (_time) for this entity: {data_last_time_seen_datetime}, latest event indexed (_indextime) for this entity: {data_last_ingest_datetime}, this indicates that the source might be receiving a mix of delayed and non-delayed events"""
        else:
            isUnderLatencyMessage = f"""Monitoring conditions are not met due to latency issues. Ingestion latency is approximately {round(float(data_last_ingestion_lag_seen), 3)} seconds (duration: {data_last_ingestion_lag_seen_duration}), which is higher than the maximum allowed latency of {int(data_max_lag_allowed)} seconds (duration: {data_max_lag_allowed_duration}), latest event available (_time) for this entity: {data_last_time_seen_datetime}, latest event indexed (_indextime) for this entity: {data_last_ingest_datetime}, this indicates that the source is receiving delayed events only"""

    else:
        isUnderLatencyMessage = f"""monitoring conditions for ingest latency are met. Ingestion latency is approximately {round(float(data_last_ingestion_lag_seen), 3)} seconds (duration: {data_last_ingestion_lag_seen_duration}), which is lower than the maximum allowed latency of {int(data_max_lag_allowed)} seconds (duration: {data_max_lag_allowed_duration}), latest event indexed (_indextime) for this entity: {data_last_ingest_datetime}"""

    # return
    return isUnderLatencyAlert, isUnderLatencyMessage
|
|
|
|
|
|
def get_dsm_delay_status(
    data_last_lag_seen,
    data_max_delay_allowed,
    data_last_ingest,
    data_last_time_seen,
):
    """
    Evaluate the event delay status of a DSM entity.

    :param data_last_lag_seen: last observed event delay in seconds
    :param data_max_delay_allowed: maximum allowed event delay in seconds
    :param data_last_ingest: epoch time of the latest ingested event (_indextime)
    :param data_last_time_seen: epoch time of the latest event (_time)

    :return: tuple (isUnderDelayAlert, isUnderDelayMessage)
        isUnderDelayAlert (bool): True if data_last_lag_seen is higher than
            data_max_delay_allowed
        isUnderDelayMessage (str): human readable description of the delay
            status, including duration and datetime renderings of the inputs
    """

    # convert inputs to float, fall back to 0 on missing or invalid values
    try:
        data_last_lag_seen = float(data_last_lag_seen)
    except (TypeError, ValueError):
        data_last_lag_seen = 0

    try:
        data_max_delay_allowed = float(data_max_delay_allowed)
    except (TypeError, ValueError):
        data_max_delay_allowed = 0

    try:
        data_last_ingest = float(data_last_ingest)
    except (TypeError, ValueError):
        data_last_ingest = 0

    try:
        data_last_time_seen = float(data_last_time_seen)
    except (TypeError, ValueError):
        data_last_time_seen = 0

    # human readable renderings used in the status messages
    data_last_lag_seen_duration = convert_seconds_to_duration(data_last_lag_seen)
    data_max_delay_allowed_duration = convert_seconds_to_duration(
        data_max_delay_allowed
    )
    data_last_ingest_datetime = convert_epoch_to_datetime(data_last_ingest)
    data_last_time_seen_datetime = convert_epoch_to_datetime(data_last_time_seen)

    # define isUnderDelayAlert
    isUnderDelayAlert = data_last_lag_seen > data_max_delay_allowed

    # define isUnderDelayMessage
    if isUnderDelayAlert:
        isUnderDelayMessage = f"""Monitoring conditions are not met due to delay issues. Event delay is {round(float(data_last_lag_seen), 3)} seconds (duration: {data_last_lag_seen_duration}), which is higher than the maximum allowed delay of {int(round(float(data_max_delay_allowed), 0))} seconds (duration: {data_max_delay_allowed_duration}), latest event available (_time) for this entity: {data_last_time_seen_datetime}, latest event ingested (_indextime) for this entity: {data_last_ingest_datetime}. This indicates that the source is receiving events with timestamps older than the threshold defined for this entity."""
    else:
        isUnderDelayMessage = f"""monitoring conditions for event delay are met. Event delay is {round(float(data_last_lag_seen), 3)} seconds (duration: {data_last_lag_seen_duration}), which is lower than the maximum allowed delay of {int(round(float(data_max_delay_allowed), 0))} seconds (duration: {data_max_delay_allowed_duration}), latest event available (_time) for this entity: {data_last_time_seen_datetime}"""

    # return
    return isUnderDelayAlert, isUnderDelayMessage
|
|
|
|
|
|
def set_dsm_status(
|
|
logger,
|
|
splunkd_uri,
|
|
session_key,
|
|
tenant_id,
|
|
record,
|
|
isOutlier,
|
|
isAnomaly,
|
|
isFuture,
|
|
isFutureMsg,
|
|
isUnderMonitoring,
|
|
isUnderMonitoringMsg,
|
|
isUnderDcountHost,
|
|
isUnderDcountHostMsg,
|
|
object_logical_group_dict,
|
|
isUnderLatencyAlert,
|
|
isUnderLatencyMessage,
|
|
isUnderDelayAlert,
|
|
isUnderDelayMessage,
|
|
disruption_queue_collection,
|
|
disruption_queue_record,
|
|
source_handler=None,
|
|
monitoring_anomaly_reason=None,
|
|
score=None,
|
|
score_outliers=None,
|
|
vtenant_account=None,
|
|
):
|
|
"""
|
|
Create a function called set_dsm_status:
|
|
- arguments: record, isOutlier, isAnomaly, isFuture, isUnderMonitoring, isUnderMonitoringMsg, isUnderDcountHost, isUnderLogicalGroup, LogicalGroupStateInAlert, isUnderLatencyAlert, isUnderLatencyMessage, isUnderDelayAlert, isUnderDelayMessage
|
|
- returns:
|
|
object_state (string): blue, orange, green, red
|
|
anomaly_reason (list): list of short code reasons why the object is in anomaly
|
|
status_message (list): list of long description reasons why the object is in anomaly
|
|
- behaviour:
|
|
object_state:
|
|
green if:
|
|
isOutlier is 1
|
|
isAnomaly is 1
|
|
isFuture is False
|
|
isUnderMonitoring is True
|
|
isUnderDcountHost is False
|
|
if isUnderLogicalGroup is True, then LogicalGroupStateInAlert must be False
|
|
isUnderLatencyAlert is False
|
|
isUnderDelayAlert is False
|
|
blue if:
|
|
Any of the condition above is not met, but isUnderLogicalGroup is True and LogicalGroupStateInAlert is True
|
|
orange if:
|
|
All green conditions are met except for isFuture which would be True
|
|
red if:
|
|
Any of the green conditions are not met, and blue conditions and orange conditions are not met
|
|
anomaly_reason:
|
|
if object_state is green, anomnaly_reason is None
|
|
Otherwise, anomaly_reason is a list containing the reasons why the object is in anomaly
|
|
"""
|
|
|
|
# init status_message and anomaly_reason
|
|
status_message = []
|
|
anomaly_reason = []
|
|
|
|
# init status_message_json
|
|
status_message_json = {}
|
|
|
|
# init original_object_state
|
|
original_object_state = record.get("object_state", "green")
|
|
|
|
# define object_state
|
|
# Check outliers: if isOutlier == 1 but score_outliers <= 0, treat as no outlier (suppressed)
|
|
isOutlierEffective = isOutlier == 1
|
|
if score_outliers is not None and score_outliers <= 0:
|
|
# Outliers are suppressed (false positive), don't treat as outlier
|
|
isOutlierEffective = False
|
|
|
|
if (
|
|
(isOutlierEffective == False or isOutlier == 2)
|
|
and (isAnomaly == 0 or isAnomaly == 2)
|
|
and isUnderDcountHost is False
|
|
and isUnderLatencyAlert is False
|
|
and isUnderDelayAlert is False
|
|
):
|
|
object_state = "green"
|
|
else:
|
|
object_state = "red"
|
|
|
|
#
|
|
# Logical group management
|
|
#
|
|
|
|
(
|
|
isUnderLogicalGroup,
|
|
LogicalGroupStateInAlert,
|
|
LogicalGroupMsg,
|
|
) = get_and_manage_logical_group_status(
|
|
splunkd_uri,
|
|
session_key,
|
|
tenant_id,
|
|
record.get("object"),
|
|
object_state,
|
|
record.get("object_group_key"),
|
|
object_logical_group_dict,
|
|
)
|
|
|
|
# log debug
|
|
logging.debug(
|
|
f'function get_and_manage_logical_group_status: object="{record.get("object")}", object_state="{object_state}", object_group_key="{record.get("object_group_key")}", isUnderLogicalGroup="{isUnderLogicalGroup}", LogicalGroupStateInAlert="{LogicalGroupStateInAlert}", LogicalGroupMsg="{LogicalGroupMsg}"'
|
|
)
|
|
|
|
# if object_state is red but isUnderLogicalGroup is True and LogicalGroupStateInAlert is False, then object_state is blue
|
|
if object_state == "red" and isUnderLogicalGroup is True:
|
|
if LogicalGroupStateInAlert is False:
|
|
object_state = "blue"
|
|
|
|
# if object_state is not red or blue but isFuture is True, then object_state is orange
|
|
if object_state not in ["red", "blue"]:
|
|
if isFuture is True:
|
|
object_state = "orange"
|
|
|
|
# if object_state is red but isUnderMonitoring is False, then object_state is orange
|
|
if object_state == "red":
|
|
if isUnderMonitoring is False:
|
|
object_state = "orange"
|
|
|
|
#
|
|
# Hybrid scoring: Apply score-based logic
|
|
# Outliers are handled separately via score_outliers in get_outliers_status
|
|
#
|
|
total_score = None
|
|
score_definition = {}
|
|
if score is not None:
|
|
# Calculate total score with static increments for anomalies
|
|
base_score = float(score) if score is not None else 0.0
|
|
total_score = base_score
|
|
|
|
# Build score definition to track where the score comes from
|
|
# Convert base_score to integer if it's a whole number, otherwise keep as float
|
|
if base_score == int(base_score):
|
|
score_definition["base_score"] = int(base_score)
|
|
else:
|
|
score_definition["base_score"] = base_score
|
|
score_definition["components"] = []
|
|
|
|
# Add static increments for each anomaly type (using VT-specific impact scores)
|
|
if isAnomaly == 1:
|
|
increment = get_impact_score(vtenant_account, "impact_score_dsm_data_sampling_anomaly", 36)
|
|
total_score += increment
|
|
score_definition["components"].append({
|
|
"type": "data_sampling_anomaly",
|
|
"score": increment,
|
|
"description": "Data sampling anomaly detected"
|
|
})
|
|
|
|
if isUnderDelayAlert is True:
|
|
increment = get_entity_impact_score(record, "dsm", "delay", vtenant_account, 100)
|
|
total_score += increment
|
|
score_definition["components"].append({
|
|
"type": "delay_threshold_breach",
|
|
"score": increment,
|
|
"description": "Delay threshold breached"
|
|
})
|
|
|
|
if isUnderLatencyAlert is True:
|
|
increment = get_entity_impact_score(record, "dsm", "latency", vtenant_account, 48)
|
|
total_score += increment
|
|
score_definition["components"].append({
|
|
"type": "latency_threshold_breach",
|
|
"score": increment,
|
|
"description": "Latency threshold breached"
|
|
})
|
|
|
|
if isUnderDcountHost is True:
|
|
increment = get_impact_score(vtenant_account, "impact_score_dsm_min_hosts_dcount_breach", 100)
|
|
total_score += increment
|
|
score_definition["components"].append({
|
|
"type": "min_hosts_dcount_breach",
|
|
"score": increment,
|
|
"description": "Minimum hosts dcount threshold breached"
|
|
})
|
|
|
|
if isFuture is True:
|
|
increment = get_impact_score(vtenant_account, "impact_score_dsm_future_tolerance_breach", 36)
|
|
total_score += increment
|
|
score_definition["components"].append({
|
|
"type": "future_tolerance_breach",
|
|
"score": increment,
|
|
"description": "Future tolerance breached"
|
|
})
|
|
|
|
# Add outlier score if present
|
|
if score_outliers is not None and score_outliers > 0:
|
|
score_definition["score_outliers"] = float(score_outliers)
|
|
|
|
# Add score sources if available
|
|
score_source = record.get("score_source", [])
|
|
if score_source:
|
|
score_definition["score_source"] = score_source if isinstance(score_source, list) else [score_source]
|
|
|
|
# Check for manual_score increases (positive scores from manual_score source)
|
|
# If manual_score increases the score without a related anomaly, add score_breached component
|
|
score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
|
|
if "manual_score" in score_source_list and total_score and total_score > 0:
|
|
# Check if there are no other anomaly components
|
|
has_other_anomalies = (
|
|
isAnomaly == 1
|
|
or isUnderDelayAlert is True
|
|
or isUnderLatencyAlert is True
|
|
or isUnderDcountHost is True
|
|
or isFuture is True
|
|
or (score_outliers is not None and score_outliers > 0)
|
|
)
|
|
if not has_other_anomalies:
|
|
# Manual score increase without other anomalies - add score_breached component
|
|
score_definition["components"].append({
|
|
"type": "manual_score",
|
|
"score": 0, # Score is already included in base_score calculation
|
|
"description": "Manual score influence applied without related anomaly"
|
|
})
|
|
|
|
# Convert total_score to integer if it's a whole number, otherwise keep as float
|
|
if total_score is not None:
|
|
if total_score == int(total_score):
|
|
score_definition["total_score"] = int(total_score)
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
|
|
# Apply score-based logic:
|
|
# - If total_score >= 100: entity should be red (if not already red due to other reasons, keep current state)
|
|
# - If total_score > 0 and < 100: entity should be orange (even if currently green)
|
|
# - If total_score == 0: keep current state
|
|
|
|
if total_score >= 100:
|
|
# If score >= 100, ensure entity is red (unless it's blue due to logical group)
|
|
if object_state not in ["red", "blue"]:
|
|
object_state = "red"
|
|
logging.debug(
|
|
f'set_dsm_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", setting state to red (score >= 100)'
|
|
)
|
|
else:
|
|
logging.debug(
|
|
f'set_dsm_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", keeping {object_state} state (score >= 100)'
|
|
)
|
|
elif total_score > 0 and total_score < 100:
|
|
# If score > 0 and < 100, entity should be orange (even if currently green)
|
|
if object_state == "green":
|
|
object_state = "orange"
|
|
# Add status message about score
|
|
score_msg = f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
|
|
# Add outlier context if outliers are present
|
|
if score_outliers is not None and score_outliers > 0:
|
|
score_msg += f"Outlier anomalies detected with a score of {score_outliers:.1f}. "
|
|
score_msg += "This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
|
|
status_message.append(score_msg)
|
|
logging.debug(
|
|
f'set_dsm_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", setting green to orange (0 < score < 100)'
|
|
)
|
|
elif object_state == "red":
|
|
# Downgrade red to orange if score < 100
|
|
# Only apply score-based downgrade if the red state is NOT due to outliers
|
|
# (outliers with score_outliers >= 100 should still be red)
|
|
if isOutlier != 1:
|
|
object_state = "orange"
|
|
# Add status message about score when downgrading
|
|
score_msg = f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
|
|
if score_outliers is not None and score_outliers > 0:
|
|
score_msg += f"Outlier anomalies detected with a score of {score_outliers:.1f}. "
|
|
score_msg += "This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
|
|
status_message.append(score_msg)
|
|
logging.debug(
|
|
f'set_dsm_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", downgrading red to orange (non-outlier anomalies only)'
|
|
)
|
|
else:
|
|
# If outlier is present but score_outliers < 100, it was already set to isOutlier=2
|
|
# in get_outliers_status, so we can still apply score-based logic
|
|
if score_outliers is not None and score_outliers < 100:
|
|
object_state = "orange"
|
|
# Add status message about score when downgrading due to low outlier score
|
|
score_msg = f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
|
|
score_msg += f"Outlier anomalies detected with a score of {score_outliers:.1f}. "
|
|
score_msg += "This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
|
|
status_message.append(score_msg)
|
|
logging.debug(
|
|
f'set_dsm_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", score_outliers="{score_outliers}", '
|
|
f'downgrading red to orange (outlier score too low)'
|
|
)
|
|
else:
|
|
logging.debug(
|
|
f'set_dsm_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", keeping red state (outlier score >= 100)'
|
|
)
|
|
else:
|
|
logging.debug(
|
|
f'set_dsm_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", keeping {object_state} state (0 < score < 100)'
|
|
)
|
|
else:
|
|
# total_score == 0 or total_score <= 0
|
|
# Check if score is 0 due to false_positive (global false positive, not just outliers)
|
|
score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
|
|
has_false_positive = "false_positive" in score_source_list
|
|
|
|
if has_false_positive:
|
|
# Score is 0 due to false_positive, set to green (anomaly_reason will remain visible for audit)
|
|
object_state = "green"
|
|
logging.debug(
|
|
f'set_dsm_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", score_source="{score_source}", '
|
|
f'setting state to green (false positive set, score cancelled)'
|
|
)
|
|
elif score_outliers is not None and score_outliers <= 0:
|
|
# Check if there are any other issues
|
|
has_other_issues = (
|
|
(isAnomaly == 1)
|
|
or isUnderDcountHost is True
|
|
or isUnderLatencyAlert is True
|
|
or isUnderDelayAlert is True
|
|
)
|
|
if not has_other_issues:
|
|
# Outliers are suppressed (false positive), and no other issues, set to green
|
|
object_state = "green"
|
|
logging.debug(
|
|
f'set_dsm_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", score_outliers="{score_outliers}", '
|
|
f'setting state to green (outliers suppressed, no other issues)'
|
|
)
|
|
else:
|
|
logging.debug(
|
|
f'set_dsm_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", score_outliers="{score_outliers}", '
|
|
f'keeping current state (score == 0, but other issues present)'
|
|
)
|
|
else:
|
|
logging.debug(
|
|
f'set_dsm_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", score_outliers="{score_outliers}", '
|
|
f'keeping current state (score == 0)'
|
|
)
|
|
|
|
# define anomaly_reason
|
|
if object_state == "green":
|
|
status_message.append(isUnderDelayMessage)
|
|
status_message.append(isUnderLatencyMessage)
|
|
|
|
# Check if false positive is set - if so, preserve anomaly reasons from score_definition
|
|
score_source = record.get("score_source", [])
|
|
score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
|
|
has_false_positive = "false_positive" in score_source_list
|
|
|
|
if has_false_positive and score_definition and "components" in score_definition:
|
|
# Extract anomaly reasons from score_definition components
|
|
for component in score_definition.get("components", []):
|
|
component_type = component.get("type")
|
|
if component_type:
|
|
mapped_reason = get_anomaly_reason_from_component_type(component_type)
|
|
if mapped_reason and mapped_reason not in anomaly_reason:
|
|
anomaly_reason.append(mapped_reason)
|
|
# If no components found, still add "none"
|
|
if not anomaly_reason:
|
|
anomaly_reason.append("none")
|
|
else:
|
|
anomaly_reason.append("none")
|
|
|
|
# if in a logical group, add the logical group message
|
|
if isUnderLogicalGroup is True:
|
|
status_message.append(LogicalGroupMsg)
|
|
|
|
else:
|
|
# Check for outliers: either isOutlier == 1 (traditional) or score_outliers > 0 (hybrid scoring)
|
|
if isOutlier == 1 or (score_outliers is not None and score_outliers > 0):
|
|
# Always add outlier reasons when outliers are present (either traditional or hybrid scoring)
|
|
outlier_reasons = record.get("isOutlierReason", [])
|
|
if outlier_reasons:
|
|
if isinstance(outlier_reasons, list):
|
|
# Join the list elements into a single string
|
|
outlier_reasons_str = " | ".join(outlier_reasons)
|
|
status_message.append(outlier_reasons_str)
|
|
else:
|
|
# If it's not a list, append it directly
|
|
status_message.append(outlier_reasons)
|
|
# Add ml_outliers_detection to anomaly_reason for all outlier cases
|
|
if "ml_outliers_detection" not in anomaly_reason:
|
|
anomaly_reason.append("ml_outliers_detection")
|
|
|
|
# Add status message for orange state (score_outliers > 0 and < 100)
|
|
if score_outliers is not None and score_outliers > 0 and score_outliers < 100:
|
|
base_score = float(score) if score is not None else 0.0
|
|
status_message.append(
|
|
f"Entity has an impact score of {score_outliers:.1f} (base score: {base_score:.1f}), which is above 0 but below 100. "
|
|
f"This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
|
|
)
|
|
|
|
if isAnomaly == 1:
|
|
status_message.append(
|
|
"anomalies detected in the data sampling and format recognition, review the data sampling screen to investigate. This alert means that trackMe detected an issue in the format of the events compared to the format that was previously identified for this source"
|
|
)
|
|
anomaly_reason.append("data_sampling_anomaly")
|
|
|
|
if isFuture is True:
|
|
status_message.append(isFutureMsg)
|
|
anomaly_reason.append("future_over_tolerance")
|
|
|
|
# Monitoring time policy, add the message first then the anomaly reason
|
|
if isUnderMonitoring is False:
|
|
status_message.append(isUnderMonitoringMsg)
|
|
|
|
# Use new monitoring anomaly reason if provided, otherwise use legacy separate reasons
|
|
if monitoring_anomaly_reason:
|
|
anomaly_reason.append(monitoring_anomaly_reason)
|
|
|
|
if isUnderDcountHost is True:
|
|
status_message.append(isUnderDcountHostMsg)
|
|
anomaly_reason.append("min_hosts_dcount")
|
|
|
|
if isUnderLatencyAlert is True:
|
|
status_message.append(isUnderLatencyMessage)
|
|
anomaly_reason.append("lag_threshold_breached")
|
|
|
|
if isUnderDelayAlert is True:
|
|
status_message.append(isUnderDelayMessage)
|
|
anomaly_reason.append("delay_threshold_breached")
|
|
|
|
# logical group
|
|
if isUnderLogicalGroup is True:
|
|
status_message.append(LogicalGroupMsg)
|
|
anomaly_reason.append("in_logical_group")
|
|
|
|
# form status_message_json
|
|
status_message_json["status_message"] = status_message
|
|
status_message_json["anomaly_reason"] = anomaly_reason
|
|
|
|
# Add score information to status_message_json for UI display (sorted alphabetically)
|
|
# Use total_score if calculated (hybrid scoring), otherwise use base score
|
|
if total_score is not None:
|
|
status_message_json["score"] = float(total_score)
|
|
# Update record score to reflect the calculated total_score for UI consistency
|
|
record["score"] = float(total_score)
|
|
# Add score definition for drilldown modal
|
|
if score_definition:
|
|
status_message_json["score_definition"] = score_definition
|
|
record["score_definition"] = json.dumps(score_definition) if isinstance(score_definition, dict) else score_definition
|
|
elif score is not None:
|
|
status_message_json["score"] = float(score)
|
|
if score_outliers is not None:
|
|
status_message_json["score_outliers"] = float(score_outliers)
|
|
if total_score is not None:
|
|
status_message_json["total_score"] = float(total_score)
|
|
|
|
# get disruption_duration
|
|
if not disruption_queue_record:
|
|
record["disruption_min_time_sec"] = 0
|
|
|
|
else:
|
|
|
|
logger.debug(
|
|
f'disruption_queue_record="{disruption_queue_record}", getting disruption_duration'
|
|
)
|
|
|
|
disruption_object_state = disruption_queue_record.get("object_state", "green")
|
|
try:
|
|
disruption_min_time_sec = int(
|
|
disruption_queue_record.get("disruption_min_time_sec", 0)
|
|
)
|
|
except:
|
|
disruption_min_time_sec = 0
|
|
# add to the record
|
|
record["disruption_min_time_sec"] = disruption_min_time_sec
|
|
|
|
try:
|
|
disruption_start_epoch = float(
|
|
disruption_queue_record.get("disruption_start_epoch", 0)
|
|
)
|
|
except:
|
|
disruption_start_epoch = 0
|
|
|
|
# Case 1: Entity is no longer in alert state (not red)
|
|
if object_state != "red":
|
|
# Only update if we were previously tracking a disruption
|
|
if disruption_object_state == "red":
|
|
disruption_queue_record["object_state"] = object_state
|
|
disruption_queue_record["disruption_start_epoch"] = 0
|
|
disruption_queue_record["mtime"] = time.time()
|
|
|
|
try:
|
|
disruption_queue_update(
|
|
disruption_queue_collection, disruption_queue_record
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"error updating disruption_queue_record: {e}")
|
|
return object_state, status_message, status_message_json, anomaly_reason
|
|
|
|
# Case 2: Entity is in alert state (red)
|
|
if object_state == "red":
|
|
current_time = time.time()
|
|
|
|
# If this is a new disruption, start tracking it
|
|
if disruption_object_state != "red":
|
|
disruption_queue_record["object_state"] = "red"
|
|
disruption_queue_record["disruption_start_epoch"] = current_time
|
|
disruption_queue_record["mtime"] = current_time
|
|
|
|
try:
|
|
disruption_queue_update(
|
|
disruption_queue_collection, disruption_queue_record
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"error updating disruption_queue_record: {e}")
|
|
|
|
# For new disruptions, if min time is set, show as blue with message
|
|
if disruption_min_time_sec > 0:
|
|
object_state = "blue"
|
|
status_message.append(
|
|
f"Minimal disruption time is configured for this entity, the current disruption duration is 0 which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
|
|
)
|
|
status_message_json["status_message"] = status_message
|
|
return object_state, status_message, status_message_json, anomaly_reason
|
|
|
|
# If we're already tracking a disruption, check duration
|
|
if disruption_min_time_sec > 0:
|
|
try:
|
|
disruption_duration = current_time - disruption_start_epoch
|
|
except Exception as e:
|
|
logger.error(f"error calculating disruption_duration: {e}")
|
|
disruption_duration = 0
|
|
|
|
# If duration hasn't breached threshold, show as blue with message
|
|
if disruption_duration < disruption_min_time_sec:
|
|
object_state = "blue"
|
|
status_message.append(
|
|
f"Minimal disruption time is configured for this entity, the current disruption duration is {convert_seconds_to_duration(disruption_duration)} which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
|
|
)
|
|
status_message_json["status_message"] = status_message
|
|
|
|
# anomaly_reason sanitify check, if the list has more than 1 item, and contains "none", remove it
|
|
if isinstance(anomaly_reason, list):
|
|
if len(anomaly_reason) > 1 and "none" in anomaly_reason:
|
|
anomaly_reason.remove("none")
|
|
|
|
# return
|
|
logging.debug(
|
|
f'set_dsm_status, object="{record.get("object")}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
|
|
)
|
|
|
|
return (
|
|
object_state,
|
|
status_message,
|
|
status_message_json,
|
|
anomaly_reason,
|
|
)
|
|
|
|
|
|
def set_dhm_status(
    logger,
    splunkd_uri,
    session_key,
    tenant_id,
    record,
    isOutlier,
    isFuture,
    isFutureMsg,
    isUnderMonitoring,
    isUnderMonitoringMsg,
    object_logical_group_dict,
    isUnderLatencyAlert,
    isUnderLatencyMessage,
    isUnderDelayAlert,
    isUnderDelayMessage,
    default_splk_dhm_alerting_policy,
    disruption_queue_collection,
    disruption_queue_record,
    source_handler=None,
    monitoring_anomaly_reason=None,
    score=None,
    score_outliers=None,
    vtenant_account=None,
):
    """
    Compute the final state of a DHM (Data Host Monitoring) entity and the
    associated messages, anomaly reasons and score details.

    The decision pipeline, in order:
      1. Initial green/red state from outliers, latency and delay flags
         (outliers are suppressed when score_outliers <= 0).
      2. Logical group handling via get_and_manage_logical_group_status
         (may later turn a red state into blue when the group is not in alert).
      3. Per-sourcetype summary (splk_dhm_st_summary) parsing and the
         alerting policy (track_per_host / track_per_sourcetype / global_policy),
         including entity-level max lag / max delay overrides when every
         sourcetype is in alert.
      4. isFuture (orange) and isUnderMonitoring (red downgraded to orange).
      5. Hybrid scoring: when `score` is provided, static increments are added
         per anomaly type and the total score drives red/orange/green
         adjustments, with false-positive handling via record["score_source"].
      6. status_message / anomaly_reason assembly and score reporting into
         status_message_json (record["score"] / record["score_definition"]
         are mutated for UI consistency).
      7. Disruption-queue handling: tracks disruption start/stop in the KV
         collection and turns red into blue while the configured minimal
         disruption time has not been breached yet. NOTE: this branch can
         return early.

    Arguments:
        logger: logger instance used for debug/error messages.
        splunkd_uri (str): splunkd URI, passed to logical group helper.
        session_key (str): Splunk session key, passed to logical group helper.
        tenant_id (str): tenant identifier, passed to logical group helper.
        record (dict): the entity record; read for thresholds, summaries and
            blocklists, and MUTATED (disruption_min_time_sec, score,
            score_definition keys are written back).
        isOutlier (int): 1 = outlier detected, 2 = outlier disabled/acknowledged.
        isFuture (bool): True when future-dated events exceed tolerance.
        isFutureMsg (str): message to publish when isFuture is True.
        isUnderMonitoring (bool): False when outside the monitoring time policy.
        isUnderMonitoringMsg (str): message when outside monitoring window.
        object_logical_group_dict (dict): logical groups lookup for this tenant.
        isUnderLatencyAlert (bool): True when the lag threshold is breached.
        isUnderLatencyMessage (str): associated message.
        isUnderDelayAlert (bool): True when the delay threshold is breached.
        isUnderDelayMessage (str): associated message.
        default_splk_dhm_alerting_policy (str): tenant-level default policy,
            used when the record policy is "global_policy".
        disruption_queue_collection: KV store collection for the disruption queue.
        disruption_queue_record (dict): current disruption queue record for
            this entity, falsy when none exists; MUTATED and persisted via
            disruption_queue_update when state transitions occur.
        source_handler: unused in this function body (accepted for signature
            compatibility with callers).
        monitoring_anomaly_reason (str|None): anomaly reason to use when
            isUnderMonitoring is False.
        score (float|None): base score; enables the hybrid scoring logic.
        score_outliers (float|None): outlier impact score (hybrid scoring).
        vtenant_account (dict|None): tenant account settings used to resolve
            per-tenant impact score increments.

    Returns:
        tuple: (object_state, status_message, status_message_json,
                anomaly_reason, splk_dhm_alerting_policy) where object_state
        is one of "green", "blue", "orange", "red", status_message is a list
        of human readable messages, status_message_json is a dict embedding
        messages, anomaly reasons and score details, anomaly_reason is a
        deduplicated list of short reason codes, and splk_dhm_alerting_policy
        is the effective policy used for this entity.
    """

    # init status_message and anomaly_reason
    status_message = []
    anomaly_reason = []

    # init status_message_json
    status_message_json = {}

    # define object_state
    # Check outliers: if isOutlier == 1 but score_outliers <= 0, treat as no outlier (suppressed)
    isOutlierEffective = isOutlier == 1
    if score_outliers is not None and score_outliers <= 0:
        # Outliers are suppressed (false positive), don't treat as outlier
        isOutlierEffective = False

    # green only when no effective outlier (or outlier disabled, value 2) and
    # neither latency nor delay thresholds are breached; red otherwise
    if (
        (isOutlierEffective == False or isOutlier == 2)
        and isUnderLatencyAlert is False
        and isUnderDelayAlert is False
    ):
        object_state = "green"
    else:
        object_state = "red"

    #
    # Logical group management
    #

    (
        isUnderLogicalGroup,
        LogicalGroupStateInAlert,
        LogicalGroupMsg,
    ) = get_and_manage_logical_group_status(
        splunkd_uri,
        session_key,
        tenant_id,
        record.get("object"),
        object_state,
        record.get("object_group_key"),
        object_logical_group_dict,
    )

    # log debug
    logging.debug(
        f'function get_and_manage_logical_group_status: object="{record.get("object")}", object_state="{object_state}", object_group_key="{record.get("object_group_key")}", isUnderLogicalGroup="{isUnderLogicalGroup}", LogicalGroupStateInAlert="{LogicalGroupStateInAlert}", LogicalGroupMsg="{LogicalGroupMsg}"'
    )

    # effective alerting policy: fall back to "global_policy" when the record
    # value is empty. NOTE(review): len() on the value assumes it is a string;
    # a None stored under the key would raise here — confirm upstream schema.
    splk_dhm_alerting_policy = record.get("splk_dhm_alerting_policy", "global_policy")
    if not len(splk_dhm_alerting_policy) > 0:
        splk_dhm_alerting_policy = "global_policy"
    splk_dhm_st_summary = record.get("splk_dhm_st_summary")

    # get the entity global max delay allowed
    global_max_delay_allowed = int(
        round(float(record.get("data_max_delay_allowed", 0)), 0)
    )

    # get the entity global max lag allowed
    global_max_lag_allowed = int(round(float(record.get("data_max_lag_allowed", 0)), 0))

    # get the entity global last delay seen (data_last_lag_seen)
    global_last_event_lag = int(round(float(record.get("data_last_lag_seen", 0)), 0))

    # get the entity global last lag seen (data_last_ingestion_lag_seen)
    global_last_ingest_lag = int(
        round(float(record.get("data_last_ingestion_lag_seen", 0)), 0)
    )

    # Convert splk_dhm_st_summary to a list if it is a string
    if isinstance(splk_dhm_st_summary, str):
        splk_dhm_st_summary = [splk_dhm_st_summary]

    # counters for per-sourcetype states parsed from the summary
    count_red = 0
    count_green = 0
    sourcetypes_red_list = []

    # retrieve host_idx_blocklists, host_st_blocklists
    host_idx_blocklists = record.get("host_idx_blocklists", [])
    host_st_blocklists = record.get("host_st_blocklists", [])

    # if string, then turn into list from comma separated string
    if isinstance(host_idx_blocklists, str):
        host_idx_blocklists = host_idx_blocklists.split(",")
    if isinstance(host_st_blocklists, str):
        host_st_blocklists = host_st_blocklists.split(",")

    # splk_dhm_st_summary can actually be a list
    # Each item is expected to be the inner content of a dict literal:
    # {"<key>": {"idx": ..., "st": ..., "state": ..., "max_lag_allowed": ...,
    # "max_delay_allowed": ..., "last_ingest_lag": ..., "last_event_lag": ...}}
    if isinstance(splk_dhm_st_summary, list):

        for item_str in splk_dhm_st_summary:
            dict_str = "{" + item_str + "}"
            dict_loaded = False
            dict_loading_error = []

            # try both options: Python literal first, then JSON as fallback
            try:
                new_dict = ast.literal_eval(dict_str)
                dict_loaded = True
            except Exception as e:
                dict_loaded = False
                dict_loading_error.append(str(e))

            if not dict_loaded:
                try:
                    new_dict = json.loads(item_str)
                    dict_loaded = True
                except Exception as e:
                    dict_loaded = False
                    dict_loading_error.append(str(e))

            if dict_loaded:

                # handle blocklists: drop entries whose index or sourcetype
                # is blocklisted for this host
                new_dict = {
                    key: val
                    for key, val in new_dict.items()
                    if val["idx"] not in host_idx_blocklists
                    and val["st"] not in host_st_blocklists
                }

                # Iterate through the inner dictionaries
                for inner_dict in new_dict.values():

                    if inner_dict.get("state") == "red":
                        count_red += 1

                        max_lag_allowed = float(inner_dict.get("max_lag_allowed"))
                        max_delay_allowed = float(inner_dict.get("max_delay_allowed"))
                        last_ingest_lag = float(inner_dict.get("last_ingest_lag"))
                        last_event_lag = float(inner_dict.get("last_event_lag"))

                        # record each breach type only once in anomaly_reason
                        if (
                            last_ingest_lag > max_lag_allowed
                            and "lag_threshold_breached" not in anomaly_reason
                        ):
                            anomaly_reason.append("lag_threshold_breached")
                            sourcetypes_red_list.append(
                                f'(idx: {inner_dict.get("idx")}, st: {inner_dict.get("st")}, anomaly_reason: lag_threshold_breached)'
                            )
                        if (
                            last_event_lag > max_delay_allowed
                            and "delay_threshold_breached" not in anomaly_reason
                        ):
                            anomaly_reason.append("delay_threshold_breached")
                            sourcetypes_red_list.append(
                                f'(idx: {inner_dict.get("idx")}, st: {inner_dict.get("st")}, anomaly_reason: delay_threshold_breached)'
                            )

                    elif inner_dict.get("state") == "green":
                        count_green += 1

            else:
                logging.error(
                    f"Error in processing item_str: {item_str}. Error: {dict_loading_error}"
                )

    logging.debug(
        f'object="{record.get("object")}", count_red={count_red}, count_green={count_green}'
    )

    # turn sourcetypes_red_list into a pipe separated string
    sourcetypes_red_list = "|".join(sourcetypes_red_list)

    # Decision making based on the counts of red and green states
    if splk_dhm_alerting_policy == "global_policy":
        if default_splk_dhm_alerting_policy == "track_per_host":
            # Use object_state as it is
            pass
        elif default_splk_dhm_alerting_policy == "track_per_sourcetype":
            if count_red > 0:
                object_state = "red"
                status_message.append(
                    f"One or more sourcetypes are in alert for this entity, and policy is set to track_per_sourcetype, sourcetypes in alert: {sourcetypes_red_list}"
                )
            else:
                # Use object_state as it is
                pass
    elif splk_dhm_alerting_policy == "track_per_host":
        # Use object_state as it is
        pass
    elif splk_dhm_alerting_policy == "track_per_sourcetype":
        if count_red > 0:
            object_state = "red"
            status_message.append(
                f"One or more sourcetypes are in alert for this entity, and policy is set to track_per_sourcetype, sourcetypes in alert: {sourcetypes_red_list}"
            )
        else:
            # Use object_state as it is
            pass
    else:
        # Use object_state as it is
        pass

    # if all sourcetypes are in alert, object_state is red or orange depending on the global max delay entity values
    # NOTE(review): count_green == 0 is also true when the summary was empty or
    # unparseable — confirm whether these branches should apply in that case.
    if (
        count_green == 0
        and (global_last_event_lag >= global_max_delay_allowed)
        and (global_last_ingest_lag >= global_max_lag_allowed)
    ):
        object_state = "red"
        status_message.append(
            f"all sourcetypes are in alert for this entity, global entity max delay allowed is breached (max_delay_allowed: {global_max_delay_allowed} seconds, duration: {convert_seconds_to_duration(global_max_delay_allowed)}, last_event_lag: {global_last_event_lag} seconds, duration: {convert_seconds_to_duration(global_last_event_lag)}), global entity max lag allowed is breached (max_delay_allowed: {global_max_lag_allowed} seconds, duration: {convert_seconds_to_duration(global_max_lag_allowed)}), last_event_lag: {global_last_ingest_lag} seconds, duration: {convert_seconds_to_duration(global_last_ingest_lag)})"
        )

    elif (
        count_green == 0
        and (global_last_event_lag < global_max_delay_allowed)
        and (global_last_ingest_lag >= global_max_lag_allowed)
    ):
        object_state = "red"
        status_message.append(
            f"all sourcetypes are in alert for this entity, global entity max delay allowed is not breached but max lag allowed is breached (max_delay_allowed: {global_max_delay_allowed} seconds, duration: {convert_seconds_to_duration(global_max_delay_allowed)}, last_event_lag: {global_last_event_lag} seconds, duration: {convert_seconds_to_duration(global_last_event_lag)})"
        )

    elif (
        count_green == 0
        and (global_last_event_lag >= global_max_delay_allowed)
        and (global_last_ingest_lag < global_max_lag_allowed)
    ):
        object_state = "red"
        status_message.append(
            f"all sourcetypes are in alert for this entity, global entity max delay allowed is breached but max lag allowed is not breached (max_delay_allowed: {global_max_delay_allowed} seconds, duration: {convert_seconds_to_duration(global_max_delay_allowed)}, last_event_lag: {global_last_event_lag} seconds, duration: {convert_seconds_to_duration(global_last_event_lag)})"
        )

    elif (
        count_green == 0
        and (global_last_event_lag < global_max_delay_allowed)
        and (global_last_ingest_lag < global_max_lag_allowed)
    ):
        object_state = "green"
        status_message.append(
            f"all sourcetypes are in alert for this entity, however global entity max delay allowed and max lag allowed are not breached (max_delay_allowed: {global_max_delay_allowed} seconds, duration: {convert_seconds_to_duration(global_max_delay_allowed)}, last_event_lag: {global_last_event_lag} seconds, duration: {convert_seconds_to_duration(global_last_event_lag)})"
        )

    elif count_green == 0:
        object_state = "red"
        status_message.append(
            f"all sourcetypes are in alert for this entity, however global entity level max delay allowed and max lag allowed could not be determined, verify TrackMe logs for more information (max_delay_allowed: {global_max_delay_allowed} seconds, duration: {convert_seconds_to_duration(global_max_delay_allowed)}, last_event_lag: {global_last_event_lag} seconds, duration: {convert_seconds_to_duration(global_last_event_lag)})"
        )

    # if object_state is red but isUnderLogicalGroup is True and LogicalGroupStateInAlert is False, then object_state is blue
    if object_state == "red" and isUnderLogicalGroup is True:
        if LogicalGroupStateInAlert is False:
            object_state = "blue"

    # if object_state is not red or blue but isFuture is True, then object_state is orange
    if object_state not in ["red", "blue"]:
        if isFuture is True:
            object_state = "orange"

    # if object_state is red but if isUnderMonitoring is False, then object_state is orange
    if object_state == "red":
        if isUnderMonitoring is False:
            object_state = "orange"

    #
    # Hybrid scoring: Apply score-based logic
    # Outliers are handled separately via score_outliers in get_outliers_status
    #
    total_score = None
    score_definition = {}
    if score is not None:
        # Calculate total score with static increments for anomalies
        base_score = float(score) if score is not None else 0.0
        total_score = base_score

        # Build score definition to track where the score comes from
        # Convert base_score to integer if it's a whole number, otherwise keep as float
        if base_score == int(base_score):
            score_definition["base_score"] = int(base_score)
        else:
            score_definition["base_score"] = base_score
        score_definition["components"] = []

        # Add static increments for each anomaly type (using VT-specific impact scores)
        if isUnderDelayAlert is True:
            # default increment 100 when no tenant-specific value is resolved
            increment = get_entity_impact_score(record, "dhm", "delay", vtenant_account, 100)
            total_score += increment
            score_definition["components"].append({
                "type": "delay_threshold_breach",
                "score": increment,
                "description": "Delay threshold breached"
            })

        if isUnderLatencyAlert is True:
            # default increment 48 when no tenant-specific value is resolved
            increment = get_entity_impact_score(record, "dhm", "latency", vtenant_account, 48)
            total_score += increment
            score_definition["components"].append({
                "type": "latency_threshold_breach",
                "score": increment,
                "description": "Latency threshold breached"
            })

        if isFuture is True:
            # default increment 36 when no tenant-specific value is resolved
            increment = get_impact_score(vtenant_account, "impact_score_dhm_future_tolerance_breach", 36)
            total_score += increment
            score_definition["components"].append({
                "type": "future_tolerance_breach",
                "score": increment,
                "description": "Future tolerance breached"
            })

        # Add outlier score if present
        if score_outliers is not None and score_outliers > 0:
            score_definition["score_outliers"] = float(score_outliers)

        # Add score sources if available
        score_source = record.get("score_source", [])
        if score_source:
            score_definition["score_source"] = score_source if isinstance(score_source, list) else [score_source]

        # Convert total_score to integer if it's a whole number, otherwise keep as float
        if total_score is not None:
            if total_score == int(total_score):
                score_definition["total_score"] = int(total_score)
            else:
                score_definition["total_score"] = total_score
        else:
            score_definition["total_score"] = total_score

        # Apply score-based logic:
        # - If total_score >= 100: entity should be red (if not already red due to other reasons, keep current state)
        # - If total_score > 0 and < 100: entity should be orange (even if currently green)
        # - If total_score == 0: keep current state

        if total_score >= 100:
            # If score >= 100, ensure entity is red (unless it's blue due to logical group)
            if object_state not in ["red", "blue"]:
                object_state = "red"
                logging.debug(
                    f'set_dhm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", setting state to red (score >= 100)'
                )
            else:
                logging.debug(
                    f'set_dhm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", keeping {object_state} state (score >= 100)'
                )
        elif total_score > 0 and total_score < 100:
            # If score > 0 and < 100, entity should be orange (even if currently green)
            if object_state == "green":
                object_state = "orange"
                # Add status message about score
                # NOTE(review): formats `score` directly with :.1f — assumes
                # callers pass a numeric score, not a numeric string; confirm.
                score_msg = f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
                # Add outlier context if outliers are present
                if score_outliers is not None and score_outliers > 0:
                    score_msg += f"Outlier anomalies detected with a score of {score_outliers:.1f}. "
                score_msg += "This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
                status_message.append(score_msg)
                logging.debug(
                    f'set_dhm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", setting green to orange (0 < score < 100)'
                )
            elif object_state == "red":
                # Downgrade red to orange if score < 100
                # Only apply score-based downgrade if the red state is NOT due to outliers
                # (outliers with score_outliers >= 100 should still be red)
                if isOutlier != 1:
                    object_state = "orange"
                    # Add status message about score when downgrading
                    score_msg = f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
                    if score_outliers is not None and score_outliers > 0:
                        score_msg += f"Outlier anomalies detected with a score of {score_outliers:.1f}. "
                    score_msg += "This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
                    status_message.append(score_msg)
                    logging.debug(
                        f'set_dhm_status, hybrid scoring: object="{record.get("object")}", '
                        f'total_score="{total_score}", downgrading red to orange (non-outlier anomalies only)'
                    )
                else:
                    # If outlier is present but score_outliers < 100, it was already set to isOutlier=2
                    # in get_outliers_status, so we can still apply score-based logic
                    if score_outliers is not None and score_outliers < 100:
                        object_state = "orange"
                        # Add status message about score when downgrading due to low outlier score
                        score_msg = f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
                        score_msg += f"Outlier anomalies detected with a score of {score_outliers:.1f}. "
                        score_msg += "This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
                        status_message.append(score_msg)
                        logging.debug(
                            f'set_dhm_status, hybrid scoring: object="{record.get("object")}", '
                            f'total_score="{total_score}", score_outliers="{score_outliers}", '
                            f'downgrading red to orange (outlier score too low)'
                        )
                    else:
                        logging.debug(
                            f'set_dhm_status, hybrid scoring: object="{record.get("object")}", '
                            f'total_score="{total_score}", keeping red state (outlier score >= 100)'
                        )
            else:
                logging.debug(
                    f'set_dhm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", keeping {object_state} state (0 < score < 100)'
                )
        else:
            # total_score == 0 or total_score <= 0
            # Check if score is 0 due to false_positive (global false positive, not just outliers)
            score_source = record.get("score_source", [])
            score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
            has_false_positive = "false_positive" in score_source_list

            if has_false_positive:
                # Score is 0 due to false_positive, set to green (anomaly_reason will remain visible for audit)
                object_state = "green"
                logging.debug(
                    f'set_dhm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", score_source="{score_source}", '
                    f'setting state to green (false positive set, score cancelled)'
                )
            elif score_outliers is not None and score_outliers <= 0:
                # Check if there are any other issues
                has_other_issues = (
                    isUnderLatencyAlert is True
                    or isUnderDelayAlert is True
                )
                if not has_other_issues:
                    # Outliers are suppressed (false positive), and no other issues, set to green
                    object_state = "green"
                    logging.debug(
                        f'set_dhm_status, hybrid scoring: object="{record.get("object")}", '
                        f'total_score="{total_score}", score_outliers="{score_outliers}", '
                        f'setting state to green (outliers suppressed, no other issues)'
                    )
                else:
                    logging.debug(
                        f'set_dhm_status, hybrid scoring: object="{record.get("object")}", '
                        f'total_score="{total_score}", score_outliers="{score_outliers}", '
                        f'keeping current state (score == 0, but other issues present)'
                    )
            else:
                logging.debug(
                    f'set_dhm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", score_outliers="{score_outliers}", '
                    f'keeping current state (score == 0)'
                )

    # define anomaly_reason
    if object_state == "green":
        status_message.append(isUnderDelayMessage)
        status_message.append(isUnderLatencyMessage)

        # Check if false positive is set - if so, preserve anomaly reasons from score_definition
        score_source = record.get("score_source", [])
        score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
        has_false_positive = "false_positive" in score_source_list

        if has_false_positive and score_definition and "components" in score_definition:
            # Extract anomaly reasons from score_definition components
            for component in score_definition.get("components", []):
                component_type = component.get("type")
                if component_type:
                    mapped_reason = get_anomaly_reason_from_component_type(component_type)
                    if mapped_reason and mapped_reason not in anomaly_reason:
                        anomaly_reason.append(mapped_reason)
            # If no components found, still add "none"
            if not anomaly_reason:
                anomaly_reason.append("none")
        else:
            anomaly_reason.append("none")

        # if in a logical group, add the logical group message
        if isUnderLogicalGroup is True:
            status_message.append(LogicalGroupMsg)

    else:
        # Check for outliers: either isOutlier == 1 (traditional) or score_outliers > 0 (hybrid scoring)
        if isOutlier == 1 or (score_outliers is not None and score_outliers > 0):
            # Always add outlier reasons when outliers are present (either traditional or hybrid scoring)
            outlier_reasons = record.get("isOutlierReason", [])
            if outlier_reasons:
                if isinstance(outlier_reasons, list):
                    # Join the list elements into a single string
                    outlier_reasons_str = " | ".join(outlier_reasons)
                    status_message.append(outlier_reasons_str)
                else:
                    # If it's not a list, append it directly
                    status_message.append(outlier_reasons)
            # Add ml_outliers_detection to anomaly_reason for all outlier cases
            if "ml_outliers_detection" not in anomaly_reason:
                anomaly_reason.append("ml_outliers_detection")

            # Add status message for orange state (score_outliers > 0 and < 100)
            if score_outliers is not None and score_outliers > 0 and score_outliers < 100:
                base_score = float(score) if score is not None else 0.0
                status_message.append(
                    f"Entity has an impact score of {score_outliers:.1f} (base score: {base_score:.1f}), which is above 0 but below 100. "
                    f"This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
                )

        if isFuture is True:
            status_message.append(isFutureMsg)
            anomaly_reason.append("future_over_tolerance")

        # Monitoring time policy, add the message first then the anomaly reason
        if isUnderMonitoring is False:
            status_message.append(isUnderMonitoringMsg)
            # Use new monitoring anomaly reason if provided
            if monitoring_anomaly_reason:
                anomaly_reason.append(monitoring_anomaly_reason)

        if isUnderLatencyAlert is True:
            status_message.append(isUnderLatencyMessage)
            anomaly_reason.append("lag_threshold_breached")

        if isUnderDelayAlert is True:
            status_message.append(isUnderDelayMessage)
            anomaly_reason.append("delay_threshold_breached")

        # logical group
        if isUnderLogicalGroup is True:
            status_message.append(LogicalGroupMsg)
            anomaly_reason.append("in_logical_group")

    # form status_message_json
    status_message_json["status_message"] = status_message
    # deduplicate anomaly_reason
    # NOTE(review): set() does not preserve insertion order of the reasons
    anomaly_reason = list(set(anomaly_reason))
    status_message_json["anomaly_reason"] = anomaly_reason

    # Add score information to status_message_json for UI display (sorted alphabetically)
    # Use total_score if calculated (hybrid scoring), otherwise use base score
    if total_score is not None:
        status_message_json["score"] = float(total_score)
        # Update record score to reflect the calculated total_score for UI consistency
        record["score"] = float(total_score)
        # Add score definition for drilldown modal
        if score_definition:
            status_message_json["score_definition"] = score_definition
            record["score_definition"] = json.dumps(score_definition) if isinstance(score_definition, dict) else score_definition
    elif score is not None:
        status_message_json["score"] = float(score)
    if score_outliers is not None:
        status_message_json["score_outliers"] = float(score_outliers)
    if total_score is not None:
        status_message_json["total_score"] = float(total_score)

    # get disruption_duration
    if not disruption_queue_record:
        record["disruption_min_time_sec"] = 0

    else:
        logger.debug(
            f'disruption_queue_record="{disruption_queue_record}", getting disruption_duration'
        )

        disruption_object_state = disruption_queue_record.get("object_state", "green")
        # NOTE(review): bare except — any parsing failure silently falls back to 0
        try:
            disruption_min_time_sec = int(
                disruption_queue_record.get("disruption_min_time_sec", 0)
            )
        except:
            disruption_min_time_sec = 0
        # add to the record
        record["disruption_min_time_sec"] = disruption_min_time_sec

        # NOTE(review): bare except — any parsing failure silently falls back to 0
        try:
            disruption_start_epoch = float(
                disruption_queue_record.get("disruption_start_epoch", 0)
            )
        except:
            disruption_start_epoch = 0

        # Case 1: Entity is no longer in alert state (not red)
        if object_state != "red":
            # Only update if we were previously tracking a disruption
            if disruption_object_state == "red":
                disruption_queue_record["object_state"] = object_state
                disruption_queue_record["disruption_start_epoch"] = 0
                disruption_queue_record["mtime"] = time.time()

                try:
                    disruption_queue_update(
                        disruption_queue_collection, disruption_queue_record
                    )
                except Exception as e:
                    logger.error(f"error updating disruption_queue_record: {e}")
            # early return: non-red entities skip the final sanity check below
            return (
                object_state,
                status_message,
                status_message_json,
                anomaly_reason,
                splk_dhm_alerting_policy,
            )

        # Case 2: Entity is in alert state (red)
        if object_state == "red":
            current_time = time.time()

            # If this is a new disruption, start tracking it
            if disruption_object_state != "red":
                disruption_queue_record["object_state"] = "red"
                disruption_queue_record["disruption_start_epoch"] = current_time
                disruption_queue_record["mtime"] = current_time

                try:
                    disruption_queue_update(
                        disruption_queue_collection, disruption_queue_record
                    )
                except Exception as e:
                    logger.error(f"error updating disruption_queue_record: {e}")

                # For new disruptions, if min time is set, show as blue with message
                if disruption_min_time_sec > 0:
                    object_state = "blue"
                    status_message.append(
                        f"Minimal disruption time is configured for this entity, the current disruption duration is 0 which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
                    )
                    status_message_json["status_message"] = status_message
                # early return for new disruptions
                return (
                    object_state,
                    status_message,
                    status_message_json,
                    anomaly_reason,
                    splk_dhm_alerting_policy,
                )

            # If we're already tracking a disruption, check duration
            if disruption_min_time_sec > 0:
                try:
                    disruption_duration = current_time - disruption_start_epoch
                except Exception as e:
                    logger.error(f"error calculating disruption_duration: {e}")
                    disruption_duration = 0

                # If duration hasn't breached threshold, show as blue with message
                if disruption_duration < disruption_min_time_sec:
                    object_state = "blue"
                    status_message.append(
                        f"Minimal disruption time is configured for this entity, the current disruption duration is {convert_seconds_to_duration(disruption_duration)} which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
                    )
                    status_message_json["status_message"] = status_message

    # anomaly_reason sanitify check, if the list has more than 1 item, and contains "none", remove it
    if isinstance(anomaly_reason, list):
        if len(anomaly_reason) > 1 and "none" in anomaly_reason:
            anomaly_reason.remove("none")

    # return
    logging.debug(
        f'set_dhm_status, object="{record.get("object")}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
    )

    return (
        object_state,
        status_message,
        status_message_json,
        anomaly_reason,
        splk_dhm_alerting_policy,
    )
|
|
|
|
|
|
def set_mhm_status(
    logger,
    splunkd_uri,
    session_key,
    tenant_id,
    record,
    metric_details,
    isFuture,
    isFutureMsg,
    object_logical_group_dict,
    disruption_queue_collection,
    disruption_queue_record,
    source_handler=None,
    score=None,
    score_outliers=None,
    vtenant_account=None,
):
    """
    Compute the state of a metric entity (MHM) and its anomaly context.

    Arguments:
        logger: caller-provided logger (used for error/debug traces)
        splunkd_uri: Splunkd API URI
        session_key: Splunk session key
        tenant_id: TrackMe tenant identifier
        record (dict): the entity record, mutated in place (score, score_definition,
            disruption_min_time_sec are set on it)
        metric_details (str or list): per metric-category state summaries, each item is a
            Python-literal dict string (parsed with ast.literal_eval)
        isFuture (bool): True if the entity breaches the future tolerance
        isFutureMsg (str): message to surface when isFuture is True
        object_logical_group_dict (dict): logical groups membership lookup
        disruption_queue_collection: KVstore collection for the disruption queue
        disruption_queue_record (dict): current disruption queue record for this entity,
            falsy if the entity is not in the disruption queue
        source_handler (str, optional): identifier of the calling handler
        score (float, optional): base impact score, enables hybrid scoring when not None
        score_outliers (float, optional): outliers contribution to the score
        vtenant_account (dict, optional): virtual tenant account record, used to resolve
            per-tenant impact score increments

    Returns:
        tuple: (object_state, status_message, status_message_json, anomaly_reason)
            object_state (str): blue, orange, green or red
            status_message (list): long description reasons for the current state
            status_message_json (dict): JSON-ready structure (status_message,
                anomaly_reason, score fields)
            anomaly_reason (list): short code reasons, ["none"] when healthy

    Behaviour summary:
        green: all metric categories are green
        red: one or more metric categories are red, or none are green
        blue: red but covered by a logical group whose state is not in alert, or a
            minimal disruption time is configured and not yet breached
        orange: would be green but isFuture is True, or hybrid score in ]0, 100[
    """

    # init status_message and anomaly_reason
    status_message = []
    anomaly_reason = []

    # init status_message_json
    status_message_json = {}

    # define object_state, optimistic default, downgraded below as needed
    object_state = "green"

    #
    # Logical group management
    #

    (
        isUnderLogicalGroup,
        LogicalGroupStateInAlert,
        LogicalGroupMsg,
    ) = get_and_manage_logical_group_status(
        splunkd_uri,
        session_key,
        tenant_id,
        record.get("object"),
        object_state,
        record.get("object_group_key"),
        object_logical_group_dict,
    )

    # log debug
    # NOTE(review): module-level logging is used here (not the passed logger),
    # mirroring the sibling set_dhm_status — confirm this routing is intended
    logging.debug(
        f'function get_and_manage_logical_group_status: object="{record.get("object")}", object_state="{object_state}", object_group_key="{record.get("object_group_key")}", isUnderLogicalGroup="{isUnderLogicalGroup}", LogicalGroupStateInAlert="{LogicalGroupStateInAlert}", LogicalGroupMsg="{LogicalGroupMsg}"'
    )

    # Convert metric_details to a list if it is a string
    if isinstance(metric_details, str):
        metric_details = [metric_details]

    # counters of red/green metric categories across all summaries
    count_red = 0
    count_green = 0
    metrics_red_list = []

    # splk_dhm_st_summary can actually be a list
    if isinstance(metric_details, list):
        for item_str in metric_details:
            try:
                # each item is a Python-literal dict string, safely evaluated
                new_dict = ast.literal_eval(item_str)

                # Iterate through the inner dictionaries
                for inner_dict in new_dict.values():
                    if inner_dict.get("state") == "red":
                        count_red += 1
                        anomaly_reason.append("delay_threshold_breached")
                        metrics_red_list.append(
                            f'(idx: {inner_dict.get("idx")}, metrics: {inner_dict.get("metric_category")}, anomaly_reason: delay_threshold_breached)'
                        )

                    elif inner_dict.get("state") == "green":
                        count_green += 1

            except Exception as e:
                logging.error(
                    f"Error in processing item_str: {item_str}. Error: {str(e)}"
                )

    logging.debug(
        f'object="{record.get("object")}", count_red={count_red}, count_green={count_green}'
    )

    # turn metrics_red_list into a pipe separated string
    metrics_red_list = "|".join(metrics_red_list)

    # Decision making based on the counts of red and green states
    if count_red > 0:
        object_state = "red"
        status_message.append(
            f"One or more metric categories are in alert for this entity, metrics in alert: {metrics_red_list}"
        )
    else:
        # Use object_state as it is
        pass

    # if all metrics are in alert, then object_state is red
    if count_green == 0:
        object_state = "red"
        status_message.append("all metric categories are in alert for this entity")

    # if object_state is red but isUnderLogicalGroup is True and LogicalGroupStateInAlert is False, then object_state is blue
    if object_state == "red" and isUnderLogicalGroup is True:
        if LogicalGroupStateInAlert is False:
            object_state = "blue"

    # if object_state is not red or blue but isFuture is True, then object_state is orange
    if object_state not in ["red", "blue"]:
        if isFuture is True:
            object_state = "orange"

    #
    # Hybrid scoring: Apply score-based logic
    # MHM doesn't have outliers, only future tolerance and metric alerts
    #
    total_score = None
    score_definition = {}
    if score is not None:
        # Calculate total score with static increments for anomalies
        base_score = float(score)
        total_score = base_score

        # Build score definition to track where the score comes from
        # Convert base_score to integer if it's a whole number, otherwise keep as float
        if base_score == int(base_score):
            score_definition["base_score"] = int(base_score)
        else:
            score_definition["base_score"] = base_score
        score_definition["components"] = []

        # Add static increments for each anomaly type (using VT-specific impact scores)
        if count_red > 0:
            increment = get_impact_score(vtenant_account, "impact_score_mhm_metric_alert", 100)
            total_score += increment
            score_definition["components"].append({
                "type": "metric_alert",
                "score": increment,
                "description": "One or more metric categories in alert"
            })

        if isFuture is True:
            increment = get_impact_score(vtenant_account, "impact_score_mhm_future_tolerance_breach", 36)
            total_score += increment
            score_definition["components"].append({
                "type": "future_tolerance_breach",
                "score": increment,
                "description": "Future tolerance breached"
            })

        # Add outlier score if present
        if score_outliers is not None and score_outliers > 0:
            score_definition["score_outliers"] = float(score_outliers)

        # Add score sources if available
        score_source = record.get("score_source", [])
        if score_source:
            score_definition["score_source"] = score_source if isinstance(score_source, list) else [score_source]

        # Convert total_score to integer if it's a whole number, otherwise keep as float
        if total_score is not None:
            if total_score == int(total_score):
                score_definition["total_score"] = int(total_score)
            else:
                score_definition["total_score"] = total_score
        else:
            score_definition["total_score"] = total_score

        # Apply score-based logic:
        # - If total_score >= 100: entity should be red (if not already red due to other reasons, keep current state)
        # - If total_score > 0 and < 100: entity should be orange (even if currently green)
        # - If total_score == 0: keep current state

        if total_score >= 100:
            # If score >= 100, ensure entity is red (unless it's blue due to logical group)
            if object_state not in ["red", "blue"]:
                object_state = "red"
                logging.debug(
                    f'set_mhm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", setting state to red (score >= 100)'
                )
            else:
                logging.debug(
                    f'set_mhm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", keeping {object_state} state (score >= 100)'
                )
        elif total_score > 0 and total_score < 100:
            # If score > 0 and < 100, entity should be orange (even if currently green)
            if object_state == "green":
                object_state = "orange"
                # Add status message about score
                status_message.append(
                    f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
                    f"This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
                )
                logging.debug(
                    f'set_mhm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", setting green to orange (0 < score < 100)'
                )
            elif object_state == "red":
                # Downgrade red to orange if score < 100
                object_state = "orange"
                logging.debug(
                    f'set_mhm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", downgrading red to orange'
                )
            else:
                logging.debug(
                    f'set_mhm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", keeping {object_state} state (0 < score < 100)'
                )
        else:
            # total_score == 0 or total_score <= 0
            # Check if score is 0 due to false_positive (global false positive, not just outliers)
            score_source = record.get("score_source", [])
            score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
            has_false_positive = "false_positive" in score_source_list

            if has_false_positive:
                # Score is 0 due to false_positive, set to green (anomaly_reason will remain visible for audit)
                object_state = "green"
                logging.debug(
                    f'set_mhm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", score_source="{score_source}", '
                    f'setting state to green (false positive set, score cancelled)'
                )
            else:
                # total_score == 0, keep current state
                logging.debug(
                    f'set_mhm_status, hybrid scoring: object="{record.get("object")}", '
                    f'total_score="{total_score}", keeping current state (score == 0)'
                )

    # define anomaly_reason
    if object_state == "green":
        status_message.append(
            "All metric categories are in normal state for this entity"
        )

        # Check if false positive is set - if so, preserve anomaly reasons from score_definition
        score_source = record.get("score_source", [])
        score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
        has_false_positive = "false_positive" in score_source_list

        if has_false_positive and score_definition and "components" in score_definition:
            # Extract anomaly reasons from score_definition components
            for component in score_definition.get("components", []):
                component_type = component.get("type")
                if component_type:
                    mapped_reason = get_anomaly_reason_from_component_type(component_type)
                    if mapped_reason and mapped_reason not in anomaly_reason:
                        anomaly_reason.append(mapped_reason)
            # If no components found, still add "none"
            if not anomaly_reason:
                anomaly_reason.append("none")
        else:
            anomaly_reason.append("none")

        # if in a logical group, add the logical group message
        if isUnderLogicalGroup is True:
            status_message.append(LogicalGroupMsg)

    else:
        if isFuture is True:
            status_message.append(isFutureMsg)
            anomaly_reason.append("future_over_tolerance")

        # logical group
        if isUnderLogicalGroup is True:
            status_message.append(LogicalGroupMsg)
            anomaly_reason.append("in_logical_group")

    # Deduplicate anomaly_reason before setting it in status_message_json
    # This prevents duplicates (e.g. delay_threshold_breached added multiple times for multiple metrics)
    if isinstance(anomaly_reason, list):
        if len(anomaly_reason) > 1 and "none" in anomaly_reason:
            anomaly_reason.remove("none")
        # deduplicate anomaly_reason to avoid duplicates
        anomaly_reason = list(set(anomaly_reason))

    # form status_message_json
    status_message_json["status_message"] = status_message
    status_message_json["anomaly_reason"] = anomaly_reason

    # Add score information to status_message_json for UI display (sorted alphabetically)
    # Use total_score if calculated (hybrid scoring), otherwise use base score
    if total_score is not None:
        status_message_json["score"] = float(total_score)
        # Update record score to reflect the calculated total_score for UI consistency
        record["score"] = float(total_score)
        # Add score definition for drilldown modal
        if score_definition:
            status_message_json["score_definition"] = score_definition
            record["score_definition"] = json.dumps(score_definition) if isinstance(score_definition, dict) else score_definition
    elif score is not None:
        status_message_json["score"] = float(score)
    if score_outliers is not None:
        status_message_json["score_outliers"] = float(score_outliers)
    if total_score is not None:
        status_message_json["total_score"] = float(total_score)

    # get disruption_duration
    if not disruption_queue_record:
        record["disruption_min_time_sec"] = 0

    else:

        logger.debug(
            f'disruption_queue_record="{disruption_queue_record}", getting disruption_duration'
        )

        disruption_object_state = disruption_queue_record.get("object_state", "green")
        try:
            disruption_min_time_sec = int(
                disruption_queue_record.get("disruption_min_time_sec", 0)
            )
        except Exception:
            # bad/missing value: disable the minimal disruption time feature
            disruption_min_time_sec = 0
        # add to the record
        record["disruption_min_time_sec"] = disruption_min_time_sec

        try:
            disruption_start_epoch = float(
                disruption_queue_record.get("disruption_start_epoch", 0)
            )
        except Exception:
            disruption_start_epoch = 0

        # Case 1: Entity is no longer in alert state (not red)
        if object_state != "red":
            # Only update if we were previously tracking a disruption
            if disruption_object_state == "red":
                disruption_queue_record["object_state"] = object_state
                disruption_queue_record["disruption_start_epoch"] = 0
                disruption_queue_record["mtime"] = time.time()

                try:
                    disruption_queue_update(
                        disruption_queue_collection, disruption_queue_record
                    )
                except Exception as e:
                    logger.error(f"error updating disruption_queue_record: {e}")
            return object_state, status_message, status_message_json, anomaly_reason

        # Case 2: Entity is in alert state (red)
        if object_state == "red":
            current_time = time.time()

            # If this is a new disruption, start tracking it
            if disruption_object_state != "red":
                disruption_queue_record["object_state"] = "red"
                disruption_queue_record["disruption_start_epoch"] = current_time
                disruption_queue_record["mtime"] = current_time

                try:
                    disruption_queue_update(
                        disruption_queue_collection, disruption_queue_record
                    )
                except Exception as e:
                    logger.error(f"error updating disruption_queue_record: {e}")

                # For new disruptions, if min time is set, show as blue with message
                if disruption_min_time_sec > 0:
                    object_state = "blue"
                    status_message.append(
                        f"Minimal disruption time is configured for this entity, the current disruption duration is 0 which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
                    )
                    status_message_json["status_message"] = status_message
                    return object_state, status_message, status_message_json, anomaly_reason

            # If we're already tracking a disruption, check duration
            if disruption_min_time_sec > 0:
                try:
                    disruption_duration = current_time - disruption_start_epoch
                except Exception as e:
                    logger.error(f"error calculating disruption_duration: {e}")
                    disruption_duration = 0

                # If duration hasn't breached threshold, show as blue with message
                if disruption_duration < disruption_min_time_sec:
                    object_state = "blue"
                    status_message.append(
                        f"Minimal disruption time is configured for this entity, the current disruption duration is {convert_seconds_to_duration(disruption_duration)} which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
                    )
                    status_message_json["status_message"] = status_message

    # anomaly_reason sanity check, if the list has more than 1 item, and contains "none", remove it
    # Also ensure status_message_json is updated with deduplicated list (safety check)
    if isinstance(anomaly_reason, list):
        if len(anomaly_reason) > 1 and "none" in anomaly_reason:
            anomaly_reason.remove("none")
        # deduplicate anomaly_reason to avoid duplicates (e.g., delay_threshold_breached added multiple times for multiple metrics)
        anomaly_reason = list(set(anomaly_reason))
        # Update status_message_json to ensure it has the deduplicated list
        status_message_json["anomaly_reason"] = anomaly_reason

    # return
    logging.debug(
        f'set_mhm_status, object="{record.get("object")}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
    )

    return object_state, status_message, status_message_json, anomaly_reason
|
|
|
|
|
|
def set_flx_status(
|
|
logger,
|
|
splunkd_uri,
|
|
session_key,
|
|
tenant_id,
|
|
record,
|
|
isOutlier,
|
|
isUnderMonitoring,
|
|
isUnderMonitoringMsg,
|
|
object_logical_group_dict,
|
|
threshold_alert,
|
|
threshold_messages,
|
|
disruption_queue_collection,
|
|
disruption_queue_record,
|
|
source_handler=None,
|
|
monitoring_anomaly_reason=None,
|
|
score=None,
|
|
score_outliers=None,
|
|
threshold_scores=None,
|
|
vtenant_account=None,
|
|
):
|
|
"""
|
|
Create a function called set_flx_status:
|
|
- arguments: record, isOutlier, isFuture, isUnderMonitoring, isUnderMonitoringMsg, isUnderLogicalGroup, LogicalGroupStateInAlert, isUnderLatencyAlert, isUnderLatencyMessage, isUnderDelayAlert, isUnderDelayMessage
|
|
- returns:
|
|
object_state (string): blue, orange, green, red
|
|
anomaly_reason (list): list of short code reasons why the object is in anomaly
|
|
status_message (list): list of long description reasons why the object is in anomaly
|
|
- behaviour:
|
|
object_state:
|
|
green if:
|
|
isOutlier is 1
|
|
isFuture is False
|
|
isUnderMonitoring is True
|
|
if isUnderLogicalGroup is True, then LogicalGroupStateInAlert must be False
|
|
isUnderLatencyAlert is False
|
|
isUnderDelayAlert is False
|
|
blue if:
|
|
Any of the condition above is not met, but isUnderLogicalGroup is True and LogicalGroupStateInAlert is True
|
|
orange if:
|
|
All green conditions are met except for isFuture which would be True
|
|
red if:
|
|
Any of the green conditions are not met, and blue conditions and orange conditions are not met
|
|
anomaly_reason:
|
|
if object_state is green, anomnaly_reason is None
|
|
Otherwise, anomaly_reason is a list containing the reasons why the object is in anomaly
|
|
"""
|
|
|
|
# init status_message and anomaly_reason
|
|
status_message = []
|
|
anomaly_reason = []
|
|
|
|
# upstream anomaly_reason
|
|
upstream_anomaly_reason = record.get("anomaly_reason", [])
|
|
if isinstance(upstream_anomaly_reason, str):
|
|
upstream_anomaly_reason = [upstream_anomaly_reason]
|
|
|
|
# init status_message_json
|
|
status_message_json = {}
|
|
|
|
# status and status_description are used to compose the anomaly_reason
|
|
status = record.get("status", "unknown")
|
|
try:
|
|
status = int(status)
|
|
except Exception as e:
|
|
pass
|
|
status_description = record.get("status_description", "unknown")
|
|
|
|
# Capture original upstream status from the search before any modifications
|
|
# In real-time processing, check metrics.status first (original search status)
|
|
# If not available, fall back to the current status field
|
|
original_upstream_status = None
|
|
metrics = record.get("metrics", {})
|
|
if isinstance(metrics, str):
|
|
try:
|
|
metrics = json.loads(metrics)
|
|
except Exception:
|
|
metrics = {}
|
|
if isinstance(metrics, dict) and "status" in metrics:
|
|
try:
|
|
original_upstream_status = int(metrics["status"])
|
|
except Exception:
|
|
pass
|
|
|
|
# If metrics.status not available, use the current status field as fallback
|
|
if original_upstream_status is None:
|
|
original_upstream_status = status
|
|
|
|
# for flx, object_state can be defined upstream based on the status
|
|
object_state = "unknown"
|
|
if status == 1:
|
|
object_state = "green"
|
|
elif status == 2:
|
|
object_state = "red"
|
|
elif status == 3:
|
|
object_state = "orange"
|
|
else:
|
|
pass
|
|
|
|
# for flx, attempt to retrieve extra_attributes, if present attempt to load as an object
|
|
extra_attributes = record.get("extra_attributes", {})
|
|
if isinstance(extra_attributes, str):
|
|
if len(extra_attributes) > 0:
|
|
try:
|
|
extra_attributes = json.loads(extra_attributes)
|
|
except Exception as e:
|
|
logger.error(
|
|
f"Error in processing extra_attributes: {extra_attributes}. Error: {str(e)}"
|
|
)
|
|
else:
|
|
extra_attributes = {}
|
|
|
|
# if source_handler is not trackmedecisionmaker, consider the upstream status as the source of truth
|
|
if source_handler == "trackmedecisionmaker":
|
|
if status != 1 and not (
|
|
len(upstream_anomaly_reason) == 1
|
|
and upstream_anomaly_reason[0] == "inactive"
|
|
):
|
|
if "status_not_met" not in upstream_anomaly_reason:
|
|
upstream_anomaly_reason.append("status_not_met")
|
|
|
|
logging.debug(
|
|
f'source_handler="{source_handler}", entering set_flx_status, object="{record.get("object")}", object_state="{object_state}", status="{status}", upstream_anomaly_reason="{upstream_anomaly_reason}"'
|
|
)
|
|
|
|
#
|
|
# Threshold alert management
|
|
#
|
|
|
|
# if threshold_alert is True, then object_state is red
|
|
record["threshold_alert"] = threshold_alert
|
|
record["threshold_messages"] = threshold_messages
|
|
if threshold_alert == 1:
|
|
object_state = "red"
|
|
status = 2
|
|
anomaly_reason.append("threshold_alert")
|
|
for threshold_message in threshold_messages:
|
|
status_message.append(threshold_message)
|
|
# in record, update status_description and status_description_short with a CSV string of the threshold_messages
|
|
record["status_description"] = ",".join(threshold_messages)
|
|
record["status_description_short"] = ",".join(threshold_messages)
|
|
|
|
else:
|
|
|
|
# remove threshold_alert from upstream_anomaly_reason, if present
|
|
if "threshold_alert" in upstream_anomaly_reason:
|
|
upstream_anomaly_reason.remove("threshold_alert")
|
|
|
|
# if the unique anomaly reason was threshold_alert, then object_state is green
|
|
# BUT only if original_upstream_status is 1 (good status)
|
|
# If original_upstream_status != 1 (status_not_met), we should keep the red/orange state
|
|
if len(upstream_anomaly_reason) == 0 and original_upstream_status == 1:
|
|
object_state = "green"
|
|
status = 1
|
|
|
|
#
|
|
# Logical group management
|
|
#
|
|
|
|
(
|
|
isUnderLogicalGroup,
|
|
LogicalGroupStateInAlert,
|
|
LogicalGroupMsg,
|
|
) = get_and_manage_logical_group_status(
|
|
splunkd_uri,
|
|
session_key,
|
|
tenant_id,
|
|
record.get("object"),
|
|
object_state,
|
|
record.get("object_group_key"),
|
|
object_logical_group_dict,
|
|
)
|
|
|
|
# log debug
|
|
logger.debug(
|
|
f'function get_and_manage_logical_group_status: object="{record.get("object")}", object_state="{object_state}", object_group_key="{record.get("object_group_key")}", isUnderLogicalGroup="{isUnderLogicalGroup}", LogicalGroupStateInAlert="{LogicalGroupStateInAlert}", LogicalGroupMsg="{LogicalGroupMsg}"'
|
|
)
|
|
|
|
# get status_description_short and ensures it always has a value
|
|
status_description_short = record.get("status_description_short", None)
|
|
if not status_description_short:
|
|
record["status_description_short"] = status_description
|
|
status_description_short = status_description
|
|
|
|
# Verify isOutlier
|
|
# Only set red if isOutlier == 1 AND score_outliers > 0 (or score_outliers is None for legacy)
|
|
# If score_outliers <= 0, outliers are suppressed (false positive) and should not cause red state
|
|
if isOutlier == 1:
|
|
if score_outliers is not None:
|
|
if score_outliers > 0:
|
|
# Outliers present with positive score
|
|
if score_outliers >= 100:
|
|
object_state = "red"
|
|
status = 2
|
|
else:
|
|
# score_outliers > 0 and < 100, set to orange
|
|
object_state = "orange"
|
|
status = 3
|
|
# If score_outliers <= 0, don't set state to red/orange (outliers suppressed)
|
|
else:
|
|
# Legacy behavior: if score_outliers is not provided, use isOutlier
|
|
object_state = "red"
|
|
status = 2
|
|
else:
|
|
pass
|
|
|
|
# if object_state is red but isUnderMonitoring is False, then object_state is orange
|
|
if object_state == "red":
|
|
if isUnderMonitoring is False:
|
|
object_state = "orange"
|
|
status = 3
|
|
|
|
#
|
|
# Hybrid scoring: Apply score-based logic
|
|
# Outliers are handled separately via score_outliers in get_outliers_status
|
|
#
|
|
total_score = None
|
|
score_definition = {}
|
|
if score is not None:
|
|
# Calculate total score with static increments for anomalies
|
|
base_score = float(score) if score is not None else 0.0
|
|
total_score = base_score
|
|
|
|
# Build score definition to track where the score comes from
|
|
# Convert base_score to integer if it's a whole number, otherwise keep as float
|
|
if base_score == int(base_score):
|
|
score_definition["base_score"] = int(base_score)
|
|
else:
|
|
score_definition["base_score"] = base_score
|
|
score_definition["components"] = []
|
|
|
|
# Add static increments for each anomaly type
|
|
if threshold_alert == 1:
|
|
# Use threshold scores if provided, otherwise default to 100
|
|
if threshold_scores and len(threshold_scores) > 0:
|
|
# Sum all threshold scores (multiple thresholds can be breached)
|
|
increment = sum(threshold_scores)
|
|
else:
|
|
# Default to 100 for backward compatibility
|
|
increment = 100
|
|
total_score += increment
|
|
score_definition["components"].append({
|
|
"type": "threshold_breach",
|
|
"score": increment,
|
|
"description": "Threshold alert breached"
|
|
})
|
|
|
|
# Add outlier score if present
|
|
if score_outliers is not None and score_outliers > 0:
|
|
score_definition["score_outliers"] = float(score_outliers)
|
|
|
|
# Add score sources if available
|
|
score_source = record.get("score_source", [])
|
|
if score_source:
|
|
score_definition["score_source"] = score_source if isinstance(score_source, list) else [score_source]
|
|
|
|
# Convert total_score to integer if it's a whole number, otherwise keep as float
|
|
if total_score is not None:
|
|
if total_score == int(total_score):
|
|
score_definition["total_score"] = int(total_score)
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
|
|
# Apply score-based logic:
|
|
# - If total_score >= 100: entity should be red (if not already red due to other reasons, keep current state)
|
|
# - If total_score > 0 and < 100: entity should be orange (even if currently green)
|
|
# - If total_score == 0: keep current state
|
|
|
|
if total_score >= 100:
|
|
# If score >= 100, ensure entity is red (unless it's blue due to logical group)
|
|
if object_state not in ["red", "blue"]:
|
|
object_state = "red"
|
|
status = 2
|
|
logging.debug(
|
|
f'set_flx_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", setting state to red (score >= 100)'
|
|
)
|
|
else:
|
|
logging.debug(
|
|
f'set_flx_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", keeping {object_state} state (score >= 100)'
|
|
)
|
|
elif total_score > 0 and total_score < 100:
|
|
# If score > 0 and < 100, entity should be orange (even if currently green)
|
|
if object_state == "green":
|
|
object_state = "orange"
|
|
status = 3
|
|
# Add status message about score
|
|
status_message.append(
|
|
f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
|
|
f"This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
|
|
)
|
|
logging.debug(
|
|
f'set_flx_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", setting green to orange (0 < score < 100)'
|
|
)
|
|
elif object_state == "red":
|
|
# Downgrade red to orange if score < 100
|
|
# Only apply score-based downgrade if the red state is NOT due to outliers
|
|
# (outliers with score_outliers >= 100 should still be red)
|
|
if isOutlier != 1:
|
|
object_state = "orange"
|
|
status = 3
|
|
logging.debug(
|
|
f'set_flx_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", downgrading red to orange (non-outlier anomalies only)'
|
|
)
|
|
else:
|
|
# If outlier is present but score_outliers < 100, it was already set to isOutlier=2
|
|
# in get_outliers_status, so we can still apply score-based logic
|
|
if score_outliers is not None and score_outliers < 100:
|
|
object_state = "orange"
|
|
status = 3
|
|
logging.debug(
|
|
f'set_flx_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", score_outliers="{score_outliers}", '
|
|
f'downgrading red to orange (outlier score too low)'
|
|
)
|
|
else:
|
|
logging.debug(
|
|
f'set_flx_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", keeping red state (outlier score >= 100)'
|
|
)
|
|
else:
|
|
logging.debug(
|
|
f'set_flx_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", keeping {object_state} state (0 < score < 100)'
|
|
)
|
|
else:
|
|
# total_score == 0 or total_score <= 0
|
|
# Check if score is 0 due to false_positive (global false positive, not just outliers)
|
|
score_source = record.get("score_source", [])
|
|
score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
|
|
has_false_positive = "false_positive" in score_source_list
|
|
|
|
if has_false_positive:
|
|
# Score is 0 due to false_positive, set to green (anomaly_reason will remain visible for audit)
|
|
object_state = "green"
|
|
status = 1
|
|
logging.debug(
|
|
f'set_flx_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", score_source="{score_source}", '
|
|
f'setting state to green (false positive set, score cancelled)'
|
|
)
|
|
elif score_outliers is not None and score_outliers <= 0 and threshold_alert != 1 and original_upstream_status == 1:
|
|
# Outliers are suppressed (false positive), and no other issues, set to green
|
|
# BUT only if original_upstream_status is 1 (good status)
|
|
# If original_upstream_status != 1 (status_not_met), we should keep the red/orange state
|
|
object_state = "green"
|
|
status = 1
|
|
logging.debug(
|
|
f'set_flx_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", score_outliers="{score_outliers}", '
|
|
f'original_upstream_status="{original_upstream_status}", '
|
|
f'setting state to green (outliers suppressed, no other issues, upstream status is good)'
|
|
)
|
|
else:
|
|
# Keep current state if there are other issues or legacy behavior
|
|
# Also keep current state if original_upstream_status != 1 (status_not_met)
|
|
logging.debug(
|
|
f'set_flx_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", score_outliers="{score_outliers}", '
|
|
f'threshold_alert="{threshold_alert}", original_upstream_status="{original_upstream_status}", '
|
|
f'keeping current state (score == 0, may have upstream status issues)'
|
|
)
|
|
|
|
# Safeguard: If original_upstream_status == 2 (status_not_met), ensure state is not green
|
|
# (except for false_positive which is an explicit override)
|
|
if original_upstream_status == 2 and object_state == "green":
|
|
score_source = record.get("score_source", [])
|
|
score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
|
|
has_false_positive = "false_positive" in score_source_list
|
|
if not has_false_positive:
|
|
# Restore red state if upstream status indicates status_not_met
|
|
object_state = "red"
|
|
status = 2
|
|
logging.debug(
|
|
f'set_flx_status, safeguard: object="{record.get("object")}", '
|
|
f'original_upstream_status="{original_upstream_status}", '
|
|
f'correcting green state back to red (status_not_met detected)'
|
|
)
|
|
|
|
# define anomaly_reason
|
|
if object_state == "green":
|
|
status_message_str = f"The entity status is complying with monitoring rules (status: {status}, status_description: {status_description})"
|
|
status_message.append(status_message_str)
|
|
|
|
# Check if false positive is set - if so, preserve anomaly reasons from score_definition
|
|
score_source = record.get("score_source", [])
|
|
score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
|
|
has_false_positive = "false_positive" in score_source_list
|
|
|
|
if has_false_positive and score_definition and "components" in score_definition:
|
|
# Extract anomaly reasons from score_definition components
|
|
for component in score_definition.get("components", []):
|
|
component_type = component.get("type")
|
|
if component_type:
|
|
mapped_reason = get_anomaly_reason_from_component_type(component_type)
|
|
if mapped_reason and mapped_reason not in anomaly_reason:
|
|
anomaly_reason.append(mapped_reason)
|
|
# If no components found, still add "none"
|
|
if not anomaly_reason:
|
|
anomaly_reason.append("none")
|
|
else:
|
|
anomaly_reason.append("none")
|
|
|
|
# if in a logical group, add the logical group message
|
|
if isUnderLogicalGroup is True:
|
|
status_message.append(LogicalGroupMsg)
|
|
|
|
#
|
|
# Inactive entities management
|
|
#
|
|
|
|
# get max_sec_inactive
|
|
max_sec_inactive = record.get("max_sec_inactive", 0)
|
|
try:
|
|
max_sec_inactive = int(max_sec_inactive)
|
|
except Exception as e:
|
|
max_sec_inactive = 0
|
|
|
|
# get the age in seconds since the latest execution
|
|
sec_since_last_execution = round(
|
|
time.time() - float(record.get("tracker_runtime")), 0
|
|
)
|
|
duration_since_last_execution = convert_seconds_to_duration(
|
|
sec_since_last_execution
|
|
)
|
|
|
|
# Check and act
|
|
if float(sec_since_last_execution) > max_sec_inactive and max_sec_inactive > 0:
|
|
status_message_str = f"This entity has been inactive for more than {duration_since_last_execution} (D+HH:MM:SS) and was not actively managed by any tracker, its status was updated automatically by the inactive entities tracker"
|
|
status_message = [status_message_str]
|
|
status_description_short = "entity is red due to inactivity"
|
|
status_description = f"The entity status is red due to inactivity, it was not actively managed by any tracker for more than {duration_since_last_execution} (D+HH:MM:SS)"
|
|
anomaly_reason = ["inactive"]
|
|
object_state = "red"
|
|
status = 2
|
|
# in this case, we need to update the status_description and status_description_short
|
|
record["status_description"] = status_description
|
|
record["status_description_short"] = status_description_short
|
|
record["object_state"] = object_state
|
|
# Add score increment for inactive if scoring is enabled (using VT-specific impact score)
|
|
if score is not None and total_score is not None:
|
|
increment = get_impact_score(vtenant_account, "impact_score_flx_inactive", 100)
|
|
total_score += increment
|
|
score_definition["components"].append({
|
|
"type": "inactive",
|
|
"score": increment,
|
|
"description": "Entity inactive"
|
|
})
|
|
# Convert total_score to integer if it's a whole number, otherwise keep as float
|
|
if total_score is not None:
|
|
if total_score == int(total_score):
|
|
score_definition["total_score"] = int(total_score)
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
|
|
#
|
|
# end of inactive entities management
|
|
#
|
|
|
|
#
|
|
# Red status due to upstream Flex logic / Orange state
|
|
#
|
|
|
|
# Only add status_not_met if the original upstream status from the search indicates a problem
|
|
# (status != 1), not when we're orange/red purely due to score-based logic or outliers
|
|
# status_not_met should only be driven by the hybrid tracker search itself
|
|
# Don't add status_not_met if:
|
|
# 1. Entity is orange/red ONLY due to outliers (regardless of score)
|
|
# 2. Outliers score >= 100 (entity already red due to outliers)
|
|
# 3. Entity is orange/red ONLY due to threshold breaches with score < 100
|
|
# Check if entity is orange/red ONLY due to outliers
|
|
has_outliers_only = (
|
|
isOutlier == 1
|
|
and score_outliers is not None
|
|
and not threshold_alert
|
|
)
|
|
|
|
# Check if outliers score is >= 100 (entity already red due to outliers)
|
|
outliers_score_high = (
|
|
isOutlier == 1
|
|
and score_outliers is not None
|
|
and score_outliers >= 100
|
|
)
|
|
|
|
# Check if entity is orange/red ONLY due to threshold breaches with score < 100
|
|
# Calculate threshold score to check if it's < 100
|
|
threshold_score_sum = 0
|
|
if threshold_alert == 1 and threshold_scores and len(threshold_scores) > 0:
|
|
threshold_score_sum = sum(threshold_scores)
|
|
elif threshold_alert == 1:
|
|
threshold_score_sum = 100 # Default score if threshold_scores not provided
|
|
|
|
has_threshold_only_low_score = (
|
|
threshold_alert == 1
|
|
and isOutlier != 1
|
|
and threshold_score_sum < 100
|
|
)
|
|
|
|
# Only add status_not_met if:
|
|
# - Original upstream status was bad (status != 1) AND
|
|
# - Entity is in non-green state AND
|
|
# - Entity is NOT orange/red ONLY due to outliers (any score) AND
|
|
# - Outliers score is NOT >= 100 (don't add if outliers already made it red) AND
|
|
# - Entity is NOT orange/red ONLY due to threshold breaches with score < 100
|
|
if ((object_state == "red" and not threshold_alert) or object_state == "orange") and original_upstream_status != 1 and not has_outliers_only and not outliers_score_high and not has_threshold_only_low_score:
|
|
status_message_str = f"The entity status is not complying with monitoring rules (status: {status}, status_description: {status_description})"
|
|
status_message.append(status_message_str)
|
|
anomaly_reason.append("status_not_met")
|
|
# Add score increment for status_not_met if scoring is enabled (using VT-specific impact score)
|
|
if score is not None and total_score is not None:
|
|
increment = get_impact_score(vtenant_account, "impact_score_flx_status_not_met", 100)
|
|
total_score += increment
|
|
score_definition["components"].append({
|
|
"type": "status_not_met",
|
|
"score": increment,
|
|
"description": "Status not met"
|
|
})
|
|
# Convert total_score to integer if it's a whole number, otherwise keep as float
|
|
if total_score is not None:
|
|
if total_score == int(total_score):
|
|
score_definition["total_score"] = int(total_score)
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
# Re-check score-based logic after adding status_not_met score
|
|
# If total_score >= 100, ensure entity is red (unless it's blue due to logical group)
|
|
if total_score >= 100:
|
|
if object_state not in ["red", "blue"]:
|
|
object_state = "red"
|
|
status = 2
|
|
logging.debug(
|
|
f'set_flx_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", setting state to red after status_not_met (score >= 100)'
|
|
)
|
|
|
|
# Other statements
|
|
# Check for outliers: report isOutlier in status_message for both red and orange states
|
|
# Add ml_outliers_detection to anomaly_reason for all outlier cases
|
|
if isOutlier == 1 or (score_outliers is not None and score_outliers > 0):
|
|
# Always add outlier reasons when outliers are present (either traditional or hybrid scoring)
|
|
outlier_reasons = record.get("isOutlierReason", [])
|
|
if outlier_reasons:
|
|
if isinstance(outlier_reasons, list):
|
|
# Join the list elements into a single string
|
|
outlier_reasons_str = " | ".join(outlier_reasons)
|
|
status_message.append(outlier_reasons_str)
|
|
else:
|
|
# If it's not a list, append it directly
|
|
status_message.append(outlier_reasons)
|
|
|
|
# Add ml_outliers_detection to anomaly_reason for all outlier cases
|
|
if "ml_outliers_detection" not in anomaly_reason:
|
|
anomaly_reason.append("ml_outliers_detection")
|
|
|
|
# Add status message for orange state (score_outliers > 0 and < 100)
|
|
if score_outliers is not None and score_outliers > 0 and score_outliers < 100:
|
|
base_score = float(score) if score is not None else 0.0
|
|
status_message.append(
|
|
f"Entity has an impact score of {score_outliers:.1f} (base score: {base_score:.1f}), which is above 0 but below 100. "
|
|
f"This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
|
|
)
|
|
|
|
if object_state == "red":
|
|
|
|
# Monitoring time policy, add the message first then the anomaly reason
|
|
if isUnderMonitoring is False:
|
|
status_message.append(isUnderMonitoringMsg)
|
|
# Use new monitoring anomaly reason if provided
|
|
if monitoring_anomaly_reason:
|
|
anomaly_reason.append(monitoring_anomaly_reason)
|
|
else:
|
|
anomaly_reason.append("out_of_monitoring_times")
|
|
# Note: out_of_monitoring_times is not scored as an anomaly - it's a protective mechanism
|
|
# that prevents entities from turning red when outside their monitoring window
|
|
|
|
# logical group
|
|
if isUnderLogicalGroup is True:
|
|
status_message.append(LogicalGroupMsg)
|
|
anomaly_reason.append("in_logical_group")
|
|
# Note: in_logical_group is not scored as an anomaly - it's a protective mechanism
|
|
# that prevents entities from turning red when the logical group is compliant
|
|
|
|
#
|
|
# Logical group management (object_state is red but in a logical group which is not in alert)
|
|
#
|
|
|
|
# if object_state is red but isUnderLogicalGroup is True and LogicalGroupStateInAlert is False, then object_state is blue
|
|
if object_state == "red" and isUnderLogicalGroup is True:
|
|
if LogicalGroupStateInAlert is False:
|
|
object_state = "blue"
|
|
status = 3
|
|
#
|
|
# Out of monitoring days and hours management
|
|
#
|
|
|
|
# if object_state is red but isUnderMonitoring is False, then object_state is orange
|
|
# However, if total_score >= 100, keep red state (score-based logic takes precedence)
|
|
if object_state == "red":
|
|
if isUnderMonitoring is False:
|
|
# Don't downgrade to orange if score >= 100 (score-based logic takes precedence)
|
|
if total_score is None or total_score < 100:
|
|
object_state = "orange"
|
|
status = 3
|
|
|
|
# update status, object_state, anomaly_reason and metrics
|
|
record["status"] = status
|
|
record["object_state"] = object_state
|
|
record["anomaly_reason"] = anomaly_reason
|
|
|
|
# ensure status metric in metrics is updated
|
|
try:
|
|
metrics_record = record.get("metrics", {})
|
|
if isinstance(metrics_record, str):
|
|
metrics_record = json.loads(metrics_record)
|
|
metrics_record["status"] = status
|
|
record["metrics"] = json.dumps(metrics_record)
|
|
except Exception as e:
|
|
pass
|
|
status_message_json["status_message"] = status_message
|
|
status_message_json["anomaly_reason"] = anomaly_reason
|
|
if extra_attributes:
|
|
status_message_json["extra_attributes"] = extra_attributes
|
|
|
|
# Add score information to status_message_json for UI display (sorted alphabetically)
|
|
# Use total_score if calculated (hybrid scoring), otherwise use base score
|
|
if total_score is not None:
|
|
status_message_json["score"] = float(total_score)
|
|
# Update record score to reflect the calculated total_score for UI consistency
|
|
record["score"] = float(total_score)
|
|
# Add score definition for drilldown modal
|
|
if score_definition:
|
|
status_message_json["score_definition"] = score_definition
|
|
record["score_definition"] = json.dumps(score_definition) if isinstance(score_definition, dict) else score_definition
|
|
elif score is not None:
|
|
status_message_json["score"] = float(score)
|
|
if score_outliers is not None:
|
|
status_message_json["score_outliers"] = float(score_outliers)
|
|
if total_score is not None:
|
|
status_message_json["total_score"] = float(total_score)
|
|
|
|
# get disruption_duration
|
|
if not disruption_queue_record:
|
|
record["disruption_min_time_sec"] = 0
|
|
|
|
else:
|
|
|
|
logger.debug(
|
|
f'disruption_queue_record="{disruption_queue_record}", getting disruption_duration'
|
|
)
|
|
|
|
disruption_object_state = disruption_queue_record.get("object_state", "green")
|
|
try:
|
|
disruption_min_time_sec = int(
|
|
disruption_queue_record.get("disruption_min_time_sec", 0)
|
|
)
|
|
except:
|
|
disruption_min_time_sec = 0
|
|
# add to the record
|
|
record["disruption_min_time_sec"] = disruption_min_time_sec
|
|
|
|
try:
|
|
disruption_start_epoch = float(
|
|
disruption_queue_record.get("disruption_start_epoch", 0)
|
|
)
|
|
except:
|
|
disruption_start_epoch = 0
|
|
|
|
# Case 1: Entity is no longer in alert state (not red)
|
|
if object_state != "red":
|
|
# Only update if we were previously tracking a disruption
|
|
if disruption_object_state == "red":
|
|
disruption_queue_record["object_state"] = object_state
|
|
disruption_queue_record["disruption_start_epoch"] = 0
|
|
disruption_queue_record["mtime"] = time.time()
|
|
|
|
try:
|
|
disruption_queue_update(
|
|
disruption_queue_collection, disruption_queue_record
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"error updating disruption_queue_record: {e}")
|
|
return object_state, status_message, status_message_json, anomaly_reason
|
|
|
|
# Case 2: Entity is in alert state (red)
|
|
if object_state == "red":
|
|
current_time = time.time()
|
|
|
|
# If this is a new disruption, start tracking it
|
|
if disruption_object_state != "red":
|
|
disruption_queue_record["object_state"] = "red"
|
|
disruption_queue_record["disruption_start_epoch"] = current_time
|
|
disruption_queue_record["mtime"] = current_time
|
|
|
|
try:
|
|
disruption_queue_update(
|
|
disruption_queue_collection, disruption_queue_record
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"error updating disruption_queue_record: {e}")
|
|
|
|
# For new disruptions, if min time is set, show as blue with message
|
|
if disruption_min_time_sec > 0:
|
|
object_state = "blue"
|
|
status_message.append(
|
|
f"Minimal disruption time is configured for this entity, the current disruption duration is 0 which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
|
|
)
|
|
status_message_json["status_message"] = status_message
|
|
return object_state, status_message, status_message_json, anomaly_reason
|
|
|
|
# If we're already tracking a disruption, check duration
|
|
if disruption_min_time_sec > 0:
|
|
try:
|
|
disruption_duration = current_time - disruption_start_epoch
|
|
except Exception as e:
|
|
logger.error(f"error calculating disruption_duration: {e}")
|
|
disruption_duration = 0
|
|
|
|
# If duration hasn't breached threshold, show as blue with message
|
|
if disruption_duration < disruption_min_time_sec:
|
|
object_state = "blue"
|
|
status_message.append(
|
|
f"Minimal disruption time is configured for this entity, the current disruption duration is {convert_seconds_to_duration(disruption_duration)} which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
|
|
)
|
|
status_message_json["status_message"] = status_message
|
|
|
|
# anomaly_reason sanity check: if the list has more than 1 item and contains "none", remove it
|
|
if isinstance(anomaly_reason, list):
|
|
if len(anomaly_reason) > 1 and "none" in anomaly_reason:
|
|
anomaly_reason.remove("none")
|
|
|
|
# return
|
|
logging.debug(
|
|
f'set_flx_status, object="{record.get("object")}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
|
|
)
|
|
|
|
return object_state, status_message, status_message_json, anomaly_reason
|
|
|
|
|
|
def set_fqm_status(
|
|
logger,
|
|
splunkd_uri,
|
|
session_key,
|
|
tenant_id,
|
|
record,
|
|
isOutlier,
|
|
isUnderMonitoring,
|
|
isUnderMonitoringMsg,
|
|
object_logical_group_dict,
|
|
threshold_alert,
|
|
threshold_messages,
|
|
disruption_queue_collection,
|
|
disruption_queue_record,
|
|
source_handler=None,
|
|
monitoring_anomaly_reason=None,
|
|
score=None,
|
|
score_outliers=None,
|
|
threshold_scores=None,
|
|
vtenant_account=None,
|
|
):
|
|
"""
|
|
Create a function called set_fqm_status:
|
|
- arguments: record, isOutlier, isFuture, isUnderMonitoring, isUnderMonitoringMsg, isUnderLogicalGroup, LogicalGroupStateInAlert, isUnderLatencyAlert, isUnderLatencyMessage, isUnderDelayAlert, isUnderDelayMessage
|
|
- returns:
|
|
object_state (string): blue, orange, green, red
|
|
anomaly_reason (list): list of short code reasons why the object is in anomaly
|
|
status_message (list): list of long description reasons why the object is in anomaly
|
|
- behaviour:
|
|
object_state:
|
|
green if:
|
|
isOutlier is 1
|
|
isFuture is False
|
|
isUnderMonitoring is True
|
|
if isUnderLogicalGroup is True, then LogicalGroupStateInAlert must be False
|
|
isUnderLatencyAlert is False
|
|
isUnderDelayAlert is False
|
|
blue if:
|
|
Any of the condition above is not met, but isUnderLogicalGroup is True and LogicalGroupStateInAlert is True
|
|
orange if:
|
|
All green conditions are met except for isFuture which would be True
|
|
red if:
|
|
Any of the green conditions are not met, and blue conditions and orange conditions are not met
|
|
anomaly_reason:
|
|
if object_state is green, anomaly_reason is None
|
|
Otherwise, anomaly_reason is a list containing the reasons why the object is in anomaly
|
|
"""
|
|
|
|
# init status_message and anomaly_reason
|
|
status_message = []
|
|
anomaly_reason = []
|
|
|
|
# get percent_success
|
|
percent_success = record.get("percent_success", None)
|
|
if percent_success is not None:
|
|
percent_success = float(percent_success)
|
|
if percent_success == int(percent_success):
|
|
percent_success = int(percent_success)
|
|
else:
|
|
percent_success = 0
|
|
|
|
# get percent_coverage
|
|
percent_coverage = record.get("percent_coverage", None)
|
|
if percent_coverage is not None:
|
|
percent_coverage = float(percent_coverage)
|
|
if percent_coverage == int(percent_coverage):
|
|
percent_coverage = int(percent_coverage)
|
|
else:
|
|
percent_coverage = 0
|
|
|
|
# get fields_quality_summary JSON, and load as an object
|
|
fields_quality_summary = record.get("fields_quality_summary", {})
|
|
if isinstance(fields_quality_summary, str):
|
|
try:
|
|
fields_quality_summary = json.loads(fields_quality_summary)
|
|
except Exception as e:
|
|
fields_quality_summary = {}
|
|
else:
|
|
fields_quality_summary = {}
|
|
|
|
# get total_fields_passed and total_fields_failed (for the global entity)
|
|
if fields_quality_summary:
|
|
total_fields_passed = fields_quality_summary.get("total_fields_passed", 0)
|
|
if isinstance(total_fields_passed, str):
|
|
try:
|
|
total_fields_passed = int(total_fields_passed)
|
|
except Exception as e:
|
|
total_fields_passed = 0
|
|
total_fields_failed = fields_quality_summary.get("total_fields_failed", 0)
|
|
if isinstance(total_fields_failed, str):
|
|
try:
|
|
total_fields_failed = int(total_fields_failed)
|
|
except Exception as e:
|
|
total_fields_failed = 0
|
|
|
|
# set fqm_type (if @global in object, then fqm_type is global, otherwise it is field)
|
|
fqm_type = "field"
|
|
if "@global" in record.get("object", ""):
|
|
fqm_type = "global"
|
|
|
|
# set object_description
|
|
object_description = {}
|
|
# 1 - try to load the content of fields_quality_summary (JSON as string)
|
|
# 2 - iterate over the JSON and look for fields metadata.*
|
|
# 3 - add them to the record as metadata_<fieldname> (instead of metadata.<fieldname>)
|
|
if "fields_quality_summary" in record:
|
|
try:
|
|
fields_quality_summary = json.loads(record["fields_quality_summary"])
|
|
for field in fields_quality_summary:
|
|
if field.startswith("metadata."):
|
|
newfield_name = field.replace("metadata.", "metadata_")
|
|
object_description[f"{newfield_name}"] = fields_quality_summary[field]
|
|
except:
|
|
pass
|
|
|
|
# add field
|
|
object_description["field"] = record.get('fieldname')
|
|
object_description = json.dumps(object_description, indent=2)
|
|
|
|
record["object_description"] = object_description
|
|
|
|
# init status_message_json
|
|
status_message_json = {}
|
|
|
|
# init status, status_description, status_description_short, object_state
|
|
status = 1
|
|
if fqm_type == "field":
|
|
status_description = f"The field {record.get('fieldname')} is complying with monitoring rules, % success: {percent_success}, % coverage: {percent_coverage}"
|
|
status_description_short = f"% success: {percent_success}, % coverage: {percent_coverage}"
|
|
elif fqm_type == "global":
|
|
status_description = f"The global entity is complying with monitoring rules, % success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
|
|
status_description_short = f"% success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
|
|
|
|
object_state = "green"
|
|
|
|
# mandatorily update the record
|
|
record["status"] = status
|
|
record["status_description"] = status_description
|
|
record["status_description_short"] = status_description_short
|
|
record["object_state"] = object_state
|
|
|
|
#
|
|
# Threshold alert management
|
|
#
|
|
|
|
# In fqm, the threshold is mandatory and is the root logic of the detection
|
|
record["threshold_alert"] = threshold_alert
|
|
record["threshold_messages"] = threshold_messages
|
|
if threshold_alert == 1:
|
|
object_state = "red"
|
|
status = 2
|
|
anomaly_reason.append("threshold_alert")
|
|
for threshold_message in threshold_messages:
|
|
status_message.append(threshold_message)
|
|
# Update status_description for alert state
|
|
if fqm_type == "field":
|
|
status_description = f"The field {record.get('fieldname')} is not complying with monitoring rules, % success: {percent_success}, % coverage: {percent_coverage}"
|
|
status_description_short = f"% success: {percent_success}, % coverage: {percent_coverage}"
|
|
|
|
# include additional messages in status_message depending on the description field in fields_quality_summary
|
|
if fields_quality_summary:
|
|
quality_results_description = fields_quality_summary.get("quality_results_description", [])
|
|
|
|
for description_item in quality_results_description:
|
|
if description_item.startswith("category: Field does not exist"):
|
|
status_message.append("The field has failed to pass quality verifications (is missing), review the results from the entity field view to troubleshoot these issues")
|
|
elif description_item.startswith("category: Field exists but contains 'unknown'"):
|
|
status_message.append("The field has failed to pass quality verifications (contains unknown values), review the results from the entity field view to troubleshoot these issues")
|
|
elif description_item.startswith("category: Field is empty"):
|
|
status_message.append("The field has failed to pass quality verifications (is empty), review the results from the entity field view to troubleshoot these issues")
|
|
elif description_item.startswith("category: Field is 'unknown'"):
|
|
status_message.append("The field has failed to pass quality verifications (is unknown), review the results from the entity field view to troubleshoot these issues")
|
|
elif description_item.startswith("category: Field exists but value does not match the required pattern"):
|
|
status_message.append("The field has failed to pass the regex pattern validation, review the results from the Search not matching regex from the entity field view to extract the list of values that do not match the required pattern")
|
|
elif description_item.startswith("category: Field exists but one or more values in the list do not match the required pattern"):
|
|
status_message.append("The field has failed to pass the regex pattern validation (list values), review the results from the Search not matching regex from the entity field view to extract the list of values that do not match the required pattern")
|
|
elif description_item.startswith("category: Field does not exist but is allowed to be missing"):
|
|
# Skip this category as it's a success case
|
|
continue
|
|
elif description_item.startswith("category: Field is empty but is allowed to be empty"):
|
|
# Skip this category as it's a success case
|
|
continue
|
|
elif description_item.startswith("category: Field exists and is valid"):
|
|
# Skip this category as it's a success case
|
|
continue
|
|
|
|
elif fqm_type == "global":
|
|
status_description = f"The global entity is not complying with monitoring rules, % success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
|
|
status_description_short = f"% success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
|
|
|
|
# include an additional message in status_message if total_fields_failed is greater than 0, including the number of fields that failed
|
|
if total_fields_failed > 0:
|
|
if total_fields_failed == 1:
|
|
status_message.append(f"The global entity has {total_fields_failed} field that failed to pass quality verifications (failed field: {fields_quality_summary.get('failed_fields', [])}), review the results from the entity field view to troubleshoot these issues")
|
|
else:
|
|
status_message.append(f"The global entity has {total_fields_failed} fields that failed to pass quality verifications (failed fields: {fields_quality_summary.get('failed_fields', [])}), review the results from the entity field view to troubleshoot these issues")
|
|
|
|
record["status_description"] = status_description
|
|
record["status_description_short"] = status_description_short
|
|
record["status"] = status
|
|
record["object_state"] = object_state
|
|
|
|
#
|
|
# Logical group management
|
|
#
|
|
|
|
(
|
|
isUnderLogicalGroup,
|
|
LogicalGroupStateInAlert,
|
|
LogicalGroupMsg,
|
|
) = get_and_manage_logical_group_status(
|
|
splunkd_uri,
|
|
session_key,
|
|
tenant_id,
|
|
record.get("object"),
|
|
object_state,
|
|
record.get("object_group_key"),
|
|
object_logical_group_dict,
|
|
)
|
|
|
|
# log debug
|
|
logger.debug(
|
|
f'function get_and_manage_logical_group_status: object="{record.get("object")}", object_state="{object_state}", object_group_key="{record.get("object_group_key")}", isUnderLogicalGroup="{isUnderLogicalGroup}", LogicalGroupStateInAlert="{LogicalGroupStateInAlert}", LogicalGroupMsg="{LogicalGroupMsg}"'
|
|
)
|
|
|
|
# Verify isOutlier
|
|
# Only set red if isOutlier == 1 AND score_outliers > 0 (or score_outliers is None for legacy)
|
|
# If score_outliers <= 0, outliers are suppressed (false positive) and should not cause red state
|
|
if isOutlier == 1:
|
|
if score_outliers is not None:
|
|
if score_outliers > 0:
|
|
# Outliers present with positive score
|
|
if score_outliers >= 100:
|
|
object_state = "red"
|
|
status = 2
|
|
else:
|
|
# score_outliers > 0 and < 100, set to orange
|
|
object_state = "orange"
|
|
status = 3
|
|
# If score_outliers <= 0, don't set state to red/orange (outliers suppressed)
|
|
else:
|
|
# Legacy behavior: if score_outliers is not provided, use isOutlier
|
|
object_state = "red"
|
|
status = 2
|
|
else:
|
|
pass
|
|
|
|
# if object_state is red but isUnderMonitoring is False, then object_state is orange
|
|
if object_state == "red":
|
|
if isUnderMonitoring is False:
|
|
object_state = "orange"
|
|
status = 3
|
|
# Update status_description for orange state
|
|
if fqm_type == "field":
|
|
status_description = f"The field {record.get('fieldname')} is not complying with monitoring rules, % success: {percent_success}, % coverage: {percent_coverage}"
|
|
status_description_short = f"% success: {percent_success}, % coverage: {percent_coverage}"
|
|
elif fqm_type == "global":
|
|
status_description = f"The global entity is not complying with monitoring rules, % success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
|
|
status_description_short = f"% success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
|
|
record["status_description"] = status_description
|
|
record["status_description_short"] = status_description_short
|
|
record["status"] = status
|
|
record["object_state"] = object_state
|
|
|
|
#
|
|
# Hybrid scoring: Apply score-based logic
|
|
# Outliers are handled separately via score_outliers in get_outliers_status
|
|
#
|
|
total_score = None
|
|
score_definition = {}
|
|
if score is not None:
|
|
# Calculate total score with static increments for anomalies
|
|
base_score = float(score) if score is not None else 0.0
|
|
total_score = base_score
|
|
|
|
# Build score definition to track where the score comes from
|
|
# Convert base_score to integer if it's a whole number, otherwise keep as float
|
|
if base_score == int(base_score):
|
|
score_definition["base_score"] = int(base_score)
|
|
else:
|
|
score_definition["base_score"] = base_score
|
|
score_definition["components"] = []
|
|
|
|
# Add static increments for each anomaly type
|
|
if threshold_alert == 1:
|
|
# Use threshold scores if provided, otherwise default to 100
|
|
if threshold_scores and len(threshold_scores) > 0:
|
|
# Sum all threshold scores (multiple thresholds can be breached)
|
|
increment = sum(threshold_scores)
|
|
else:
|
|
# Default to 100 for backward compatibility
|
|
increment = 100
|
|
total_score += increment
|
|
score_definition["components"].append({
|
|
"type": "threshold_breach",
|
|
"score": increment,
|
|
"description": "Threshold alert breached"
|
|
})
|
|
|
|
# Add outlier score if present
|
|
if score_outliers is not None and score_outliers > 0:
|
|
score_definition["score_outliers"] = float(score_outliers)
|
|
|
|
# Add score sources if available
|
|
score_source = record.get("score_source", [])
|
|
if score_source:
|
|
score_definition["score_source"] = score_source if isinstance(score_source, list) else [score_source]
|
|
|
|
# Convert total_score to integer if it's a whole number, otherwise keep as float
|
|
if total_score is not None:
|
|
if total_score == int(total_score):
|
|
score_definition["total_score"] = int(total_score)
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
|
|
# Apply score-based logic:
|
|
# - If total_score >= 100: entity should be red (if not already red due to other reasons, keep current state)
|
|
# - If total_score > 0 and < 100: entity should be orange (even if currently green)
|
|
# - If total_score == 0: keep current state
|
|
|
|
if total_score >= 100:
|
|
# If score >= 100, ensure entity is red (unless it's blue due to logical group)
|
|
if object_state not in ["red", "blue"]:
|
|
object_state = "red"
|
|
status = 2
|
|
logging.debug(
|
|
f'set_fqm_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", setting state to red (score >= 100)'
|
|
)
|
|
else:
|
|
logging.debug(
|
|
f'set_fqm_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", keeping {object_state} state (score >= 100)'
|
|
)
|
|
elif total_score > 0 and total_score < 100:
|
|
# If score > 0 and < 100, entity should be orange (even if currently green)
|
|
if object_state == "green":
|
|
object_state = "orange"
|
|
status = 3
|
|
# Update status_description for orange state
|
|
if fqm_type == "field":
|
|
status_description = f"The field {record.get('fieldname')} is not complying with monitoring rules, % success: {percent_success}, % coverage: {percent_coverage}"
|
|
status_description_short = f"% success: {percent_success}, % coverage: {percent_coverage}"
|
|
elif fqm_type == "global":
|
|
status_description = f"The global entity is not complying with monitoring rules, % success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
|
|
status_description_short = f"% success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
|
|
record["status_description"] = status_description
|
|
record["status_description_short"] = status_description_short
|
|
record["status"] = status
|
|
record["object_state"] = object_state
|
|
# Add status message about score
|
|
score_msg = f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
|
|
# Add outlier context if outliers are present
|
|
if score_outliers is not None and score_outliers > 0:
|
|
score_msg += f"Outlier anomalies detected with a score of {score_outliers:.1f}. "
|
|
score_msg += "This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
|
|
status_message.append(score_msg)
|
|
logging.debug(
|
|
f'set_fqm_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", setting green to orange (0 < score < 100)'
|
|
)
|
|
elif object_state == "red":
|
|
# Downgrade red to orange if score < 100
|
|
# Only apply score-based downgrade if the red state is NOT due to outliers
|
|
# (outliers with score_outliers >= 100 should still be red)
|
|
if isOutlier != 1:
|
|
object_state = "orange"
|
|
status = 3
|
|
# Update status_description for orange state
|
|
if fqm_type == "field":
|
|
status_description = f"The field {record.get('fieldname')} is not complying with monitoring rules, % success: {percent_success}, % coverage: {percent_coverage}"
|
|
status_description_short = f"% success: {percent_success}, % coverage: {percent_coverage}"
|
|
elif fqm_type == "global":
|
|
status_description = f"The global entity is not complying with monitoring rules, % success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
|
|
status_description_short = f"% success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
|
|
record["status_description"] = status_description
|
|
record["status_description_short"] = status_description_short
|
|
record["status"] = status
|
|
record["object_state"] = object_state
|
|
logging.debug(
|
|
f'set_fqm_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", downgrading red to orange (non-outlier anomalies only)'
|
|
)
|
|
else:
|
|
# If outlier is present but score_outliers < 100, it was already set to isOutlier=2
|
|
# in get_outliers_status, so we can still apply score-based logic
|
|
if score_outliers is not None and score_outliers < 100:
|
|
object_state = "orange"
|
|
status = 3
|
|
# Update status_description for orange state
|
|
if fqm_type == "field":
|
|
status_description = f"The field {record.get('fieldname')} is not complying with monitoring rules, % success: {percent_success}, % coverage: {percent_coverage}"
|
|
status_description_short = f"% success: {percent_success}, % coverage: {percent_coverage}"
|
|
elif fqm_type == "global":
|
|
status_description = f"The global entity is not complying with monitoring rules, % success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
|
|
status_description_short = f"% success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
|
|
record["status_description"] = status_description
|
|
record["status_description_short"] = status_description_short
|
|
record["status"] = status
|
|
record["object_state"] = object_state
|
|
logging.debug(
|
|
f'set_fqm_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", score_outliers="{score_outliers}", '
|
|
f'downgrading red to orange (outlier score too low)'
|
|
)
|
|
else:
|
|
logging.debug(
|
|
f'set_fqm_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", keeping red state (outlier score >= 100)'
|
|
)
|
|
else:
|
|
logging.debug(
|
|
f'set_fqm_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", keeping {object_state} state (0 < score < 100)'
|
|
)
|
|
else:
|
|
# total_score == 0 or total_score <= 0
|
|
# Check if score is 0 due to false_positive (global false positive, not just outliers)
|
|
score_source = record.get("score_source", [])
|
|
score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
|
|
has_false_positive = "false_positive" in score_source_list
|
|
|
|
if has_false_positive:
|
|
# Score is 0 due to false_positive, set to green (anomaly_reason will remain visible for audit)
|
|
object_state = "green"
|
|
status = 1
|
|
logging.debug(
|
|
f'set_fqm_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", score_source="{score_source}", '
|
|
f'setting state to green (false positive set, score cancelled)'
|
|
)
|
|
elif score_outliers is not None and score_outliers <= 0 and threshold_alert != 1:
|
|
# Outliers are suppressed (false positive), and no other issues, set to green
|
|
object_state = "green"
|
|
status = 1
|
|
logging.debug(
|
|
f'set_fqm_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", score_outliers="{score_outliers}", '
|
|
f'setting state to green (outliers suppressed, no other issues)'
|
|
)
|
|
else:
|
|
# Keep current state if there are other issues or legacy behavior
|
|
logging.debug(
|
|
f'set_fqm_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", score_outliers="{score_outliers}", '
|
|
f'threshold_alert="{threshold_alert}", keeping current state (score == 0)'
|
|
)
|
|
|
|
# define anomaly_reason
|
|
if object_state == "green":
|
|
if fqm_type == "field":
|
|
status_message_str = f"The field {record.get('fieldname')} is complying with monitoring rules, % success: {percent_success}"
|
|
elif fqm_type == "global":
|
|
status_message_str = f"The global entity is complying with monitoring rules, % success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
|
|
status_message.append(status_message_str)
|
|
|
|
# Check if false positive is set - if so, preserve anomaly reasons from score_definition
|
|
score_source = record.get("score_source", [])
|
|
score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
|
|
has_false_positive = "false_positive" in score_source_list
|
|
|
|
if has_false_positive and score_definition and "components" in score_definition:
|
|
# Extract anomaly reasons from score_definition components
|
|
for component in score_definition.get("components", []):
|
|
component_type = component.get("type")
|
|
if component_type:
|
|
mapped_reason = get_anomaly_reason_from_component_type(component_type)
|
|
if mapped_reason and mapped_reason not in anomaly_reason:
|
|
anomaly_reason.append(mapped_reason)
|
|
# If no components found, still add "none"
|
|
if not anomaly_reason:
|
|
anomaly_reason.append("none")
|
|
else:
|
|
anomaly_reason.append("none")
|
|
|
|
# if in a logical group, add the logical group message
|
|
if isUnderLogicalGroup is True:
|
|
status_message.append(LogicalGroupMsg)
|
|
|
|
#
|
|
# Inactive entities management
|
|
#
|
|
|
|
# get max_sec_inactive
|
|
max_sec_inactive = record.get("max_sec_inactive", 0)
|
|
try:
|
|
max_sec_inactive = int(max_sec_inactive)
|
|
except Exception as e:
|
|
max_sec_inactive = 0
|
|
|
|
# get the age in seconds since the latest execution
|
|
sec_since_last_execution = round(
|
|
time.time() - float(record.get("tracker_runtime")), 0
|
|
)
|
|
duration_since_last_execution = convert_seconds_to_duration(
|
|
sec_since_last_execution
|
|
)
|
|
|
|
# Check and act
|
|
if float(sec_since_last_execution) > max_sec_inactive and max_sec_inactive > 0:
|
|
status_message_str = f"This entity has been inactive for more than {duration_since_last_execution} (D+HH:MM:SS) and was not actively managed by any tracker, its status was updated automatically by the inactive entities tracker"
|
|
status_message = [status_message_str]
|
|
status_description_short = "entity is red due to inactivity"
|
|
status_description = f"The entity status is red due to inactivity, it was not actively managed by any tracker for more than {duration_since_last_execution} (D+HH:MM:SS)"
|
|
anomaly_reason = ["inactive"]
|
|
object_state = "red"
|
|
status = 2
|
|
# in this case, we need to update the status_description and status_description_short
|
|
record["status_description"] = status_description
|
|
record["status_description_short"] = status_description_short
|
|
record["object_state"] = object_state
|
|
|
|
#
|
|
# end of inactive entities management
|
|
#
|
|
|
|
#
|
|
# Red status due to upstream logic / Orange state
|
|
#
|
|
|
|
if (object_state == "red" and not threshold_alert) or object_state == "orange":
|
|
status_message_str = f"The entity status is not complying with monitoring rules (status: {status}, status_description: {status_description})"
|
|
status_message.append(status_message_str)
|
|
anomaly_reason.append("status_not_met")
|
|
# Update status_description for alert state if not already set
|
|
if fqm_type == "field":
|
|
status_description = f"The field {record.get('fieldname')} is not complying with monitoring rules, % success: {percent_success}, % coverage: {percent_coverage}"
|
|
status_description_short = f"% success: {percent_success}, % coverage: {percent_coverage}"
|
|
elif fqm_type == "global":
|
|
status_description = f"The global entity is not complying with monitoring rules, % success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
|
|
status_description_short = f"% success: {percent_success}, fields passed: {total_fields_passed}, fields failed: {total_fields_failed}"
|
|
record["status_description"] = status_description
|
|
record["status_description_short"] = status_description_short
|
|
record["status"] = status
|
|
record["object_state"] = object_state
|
|
# Add score increment for status_not_met if scoring is enabled (using VT-specific impact score)
|
|
if score is not None and total_score is not None:
|
|
increment = get_impact_score(vtenant_account, "impact_score_fqm_status_not_met", 100)
|
|
total_score += increment
|
|
score_definition["components"].append({
|
|
"type": "status_not_met",
|
|
"score": increment,
|
|
"description": "Status not met"
|
|
})
|
|
# Convert total_score to integer if it's a whole number, otherwise keep as float
|
|
if total_score is not None:
|
|
if total_score == int(total_score):
|
|
score_definition["total_score"] = int(total_score)
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
|
|
# Other statements
|
|
if object_state == "red":
|
|
|
|
# Check for outliers: either isOutlier == 1 (traditional) or score_outliers > 0 (hybrid scoring)
|
|
if isOutlier == 1 or (score_outliers is not None and score_outliers > 0):
|
|
# Always add outlier reasons when outliers are present (either traditional or hybrid scoring)
|
|
outlier_reasons = record.get("isOutlierReason", [])
|
|
if outlier_reasons:
|
|
if isinstance(outlier_reasons, list):
|
|
# Join the list elements into a single string
|
|
outlier_reasons_str = " | ".join(outlier_reasons)
|
|
status_message.append(outlier_reasons_str)
|
|
else:
|
|
# If it's not a list, append it directly
|
|
status_message.append(outlier_reasons)
|
|
# Add ml_outliers_detection to anomaly_reason for all outlier cases
|
|
if "ml_outliers_detection" not in anomaly_reason:
|
|
anomaly_reason.append("ml_outliers_detection")
|
|
|
|
# Add status message for orange state (score_outliers > 0 and < 100)
|
|
if score_outliers is not None and score_outliers > 0 and score_outliers < 100:
|
|
base_score = float(score) if score is not None else 0.0
|
|
status_message.append(
|
|
f"Entity has an impact score of {score_outliers:.1f} (base score: {base_score:.1f}), which is above 0 but below 100. "
|
|
f"This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
|
|
)
|
|
|
|
# Monitoring time policy, add the message first then the anomaly reason
|
|
if isUnderMonitoring is False:
|
|
status_message.append(isUnderMonitoringMsg)
|
|
# Use new monitoring anomaly reason if provided
|
|
if monitoring_anomaly_reason:
|
|
anomaly_reason.append(monitoring_anomaly_reason)
|
|
|
|
# logical group
|
|
if isUnderLogicalGroup is True:
|
|
status_message.append(LogicalGroupMsg)
|
|
anomaly_reason.append("in_logical_group")
|
|
|
|
#
|
|
# Logical group management (object_state is red but in a logical group which is not in alert)
|
|
#
|
|
|
|
# if object_state is red but isUnderLogicalGroup is True and LogicalGroupStateInAlert is False, then object_state is blue
|
|
if object_state == "red" and isUnderLogicalGroup is True:
|
|
if LogicalGroupStateInAlert is False:
|
|
object_state = "blue"
|
|
status = 3
|
|
#
|
|
# Out of monitoring days and hours management
|
|
#
|
|
|
|
# if object_state is red but isUnderMonitoring is False, then object_state is orange
|
|
if object_state == "red":
|
|
if isUnderMonitoring is False:
|
|
object_state = "orange"
|
|
status = 3
|
|
|
|
# update status, object_state, anomaly_reason and metrics
|
|
record["status"] = status
|
|
record["object_state"] = object_state
|
|
record["anomaly_reason"] = anomaly_reason
|
|
|
|
# ensure status metric in metrics is updated
|
|
try:
|
|
metrics_record = record.get("metrics", {})
|
|
if isinstance(metrics_record, str):
|
|
metrics_record = json.loads(metrics_record)
|
|
metrics_record["status"] = status
|
|
record["metrics"] = json.dumps(metrics_record)
|
|
except Exception as e:
|
|
pass
|
|
status_message_json["status_message"] = status_message
|
|
status_message_json["anomaly_reason"] = anomaly_reason
|
|
|
|
# Add score information to status_message_json for UI display (sorted alphabetically)
|
|
# Use total_score if calculated (hybrid scoring), otherwise use base score
|
|
if total_score is not None:
|
|
status_message_json["score"] = float(total_score)
|
|
# Update record score to reflect the calculated total_score for UI consistency
|
|
record["score"] = float(total_score)
|
|
# Add score definition for drilldown modal
|
|
if score_definition:
|
|
status_message_json["score_definition"] = score_definition
|
|
record["score_definition"] = json.dumps(score_definition) if isinstance(score_definition, dict) else score_definition
|
|
elif score is not None:
|
|
status_message_json["score"] = float(score)
|
|
if score_outliers is not None:
|
|
status_message_json["score_outliers"] = float(score_outliers)
|
|
if total_score is not None:
|
|
status_message_json["total_score"] = float(total_score)
|
|
|
|
# handle fields_quality_summary
|
|
try:
|
|
fields_quality_summary = record.get("fields_quality_summary", {})
|
|
if isinstance(fields_quality_summary, str):
|
|
fields_quality_summary = json.loads(fields_quality_summary)
|
|
except Exception as e:
|
|
fields_quality_summary = {}
|
|
pass
|
|
if fields_quality_summary:
|
|
status_message_json["fields_quality_summary"] = fields_quality_summary
|
|
|
|
# get disruption_duration
|
|
if not disruption_queue_record:
|
|
record["disruption_min_time_sec"] = 0
|
|
|
|
else:
|
|
|
|
logger.debug(
|
|
f'disruption_queue_record="{disruption_queue_record}", getting disruption_duration'
|
|
)
|
|
|
|
disruption_object_state = disruption_queue_record.get("object_state", "green")
|
|
try:
|
|
disruption_min_time_sec = int(
|
|
disruption_queue_record.get("disruption_min_time_sec", 0)
|
|
)
|
|
except:
|
|
disruption_min_time_sec = 0
|
|
# add to the record
|
|
record["disruption_min_time_sec"] = disruption_min_time_sec
|
|
|
|
try:
|
|
disruption_start_epoch = float(
|
|
disruption_queue_record.get("disruption_start_epoch", 0)
|
|
)
|
|
except:
|
|
disruption_start_epoch = 0
|
|
|
|
# Case 1: Entity is no longer in alert state (not red)
|
|
if object_state != "red":
|
|
# Only update if we were previously tracking a disruption
|
|
if disruption_object_state == "red":
|
|
disruption_queue_record["object_state"] = object_state
|
|
disruption_queue_record["disruption_start_epoch"] = 0
|
|
disruption_queue_record["mtime"] = time.time()
|
|
|
|
try:
|
|
disruption_queue_update(
|
|
disruption_queue_collection, disruption_queue_record
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"error updating disruption_queue_record: {e}")
|
|
return object_state, status_message, status_message_json, anomaly_reason
|
|
|
|
# Case 2: Entity is in alert state (red)
|
|
if object_state == "red":
|
|
current_time = time.time()
|
|
|
|
# If this is a new disruption, start tracking it
|
|
if disruption_object_state != "red":
|
|
disruption_queue_record["object_state"] = "red"
|
|
disruption_queue_record["disruption_start_epoch"] = current_time
|
|
disruption_queue_record["mtime"] = current_time
|
|
|
|
try:
|
|
disruption_queue_update(
|
|
disruption_queue_collection, disruption_queue_record
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"error updating disruption_queue_record: {e}")
|
|
|
|
# For new disruptions, if min time is set, show as blue with message
|
|
if disruption_min_time_sec > 0:
|
|
object_state = "blue"
|
|
status_message.append(
|
|
f"Minimal disruption time is configured for this entity, the current disruption duration is 0 which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
|
|
)
|
|
status_message_json["status_message"] = status_message
|
|
return object_state, status_message, status_message_json, anomaly_reason
|
|
|
|
# If we're already tracking a disruption, check duration
|
|
if disruption_min_time_sec > 0:
|
|
try:
|
|
disruption_duration = current_time - disruption_start_epoch
|
|
except Exception as e:
|
|
logger.error(f"error calculating disruption_duration: {e}")
|
|
disruption_duration = 0
|
|
|
|
# If duration hasn't breached threshold, show as blue with message
|
|
if disruption_duration < disruption_min_time_sec:
|
|
object_state = "blue"
|
|
status_message.append(
|
|
f"Minimal disruption time is configured for this entity, the current disruption duration is {convert_seconds_to_duration(disruption_duration)} which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
|
|
)
|
|
status_message_json["status_message"] = status_message
|
|
|
|
# anomaly_reason sanity check: if the list has more than 1 item, and contains "none", remove it
|
|
if isinstance(anomaly_reason, list):
|
|
if len(anomaly_reason) > 1 and "none" in anomaly_reason:
|
|
anomaly_reason.remove("none")
|
|
|
|
# return
|
|
logging.debug(
|
|
f'set_fqm_status, object="{record.get("object")}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
|
|
)
|
|
|
|
return object_state, status_message, status_message_json, anomaly_reason
|
|
|
|
|
|
def set_wlk_status(
|
|
logger,
|
|
splunkd_uri,
|
|
session_key,
|
|
tenant_id,
|
|
record,
|
|
isOutlier,
|
|
isUnderMonitoring,
|
|
isUnderMonitoringMsg,
|
|
disruption_queue_collection,
|
|
disruption_queue_record,
|
|
source_handler=None,
|
|
monitoring_anomaly_reason=None,
|
|
score=None,
|
|
score_outliers=None,
|
|
vtenant_account=None,
|
|
):
|
|
"""
|
|
Create a function called set_wlk_status:
|
|
- arguments: record, isOutlier, isFuture, isUnderMonitoring, isUnderMonitoringMsg, isUnderLogicalGroup, LogicalGroupStateInAlert, isUnderLatencyAlert, isUnderLatencyMessage, isUnderDelayAlert, isUnderDelayMessage
|
|
- returns:
|
|
object_state (string): blue, orange, green, red
|
|
anomaly_reason (list): list of short code reasons why the object is in anomaly
|
|
status_message (list): list of long description reasons why the object is in anomaly
|
|
- behaviour:
|
|
object_state:
|
|
green if:
|
|
isOutlier is 1
|
|
isFuture is False
|
|
isUnderMonitoring is True
|
|
if isUnderLogicalGroup is True, then LogicalGroupStateInAlert must be False
|
|
isUnderLatencyAlert is False
|
|
isUnderDelayAlert is False
|
|
blue if:
|
|
Any of the condition above is not met, but isUnderLogicalGroup is True and LogicalGroupStateInAlert is True
|
|
orange if:
|
|
All green conditions are met except for isFuture which would be True
|
|
red if:
|
|
Any of the green conditions are not met, and blue conditions and orange conditions are not met
|
|
anomaly_reason:
|
|
if object_state is green, anomaly_reason is None
|
|
Otherwise, anomaly_reason is a list containing the reasons why the object is in anomaly
|
|
"""
|
|
|
|
# init status_message and anomaly_reason
|
|
status_message = []
|
|
anomaly_reason = []
|
|
|
|
# init status_message_json
|
|
status_message_json = {}
|
|
|
|
# for wlk, first retrieve object_state which is defined upstream
|
|
object_state = record.get("object_state", "green")
|
|
|
|
# status and status_description are used to compose the anomaly_reason
|
|
status = record.get("status", "unknown")
|
|
status_description = record.get("status_description", "unknown")
|
|
|
|
# Verify isOutlier
|
|
# Only set red if isOutlier == 1 AND score_outliers > 0 (or score_outliers is None for legacy)
|
|
# If score_outliers <= 0, outliers are suppressed (false positive) and should not cause red state
|
|
if isOutlier == 1:
|
|
if score_outliers is not None:
|
|
if score_outliers > 0:
|
|
# Outliers present with positive score
|
|
if score_outliers >= 100:
|
|
object_state = "red"
|
|
else:
|
|
# score_outliers > 0 and < 100, set to orange
|
|
object_state = "orange"
|
|
# If score_outliers <= 0, don't set state to red/orange (outliers suppressed)
|
|
else:
|
|
# Legacy behavior: if score_outliers is not provided, use isOutlier
|
|
object_state = "red"
|
|
else:
|
|
pass
|
|
|
|
# for wlk, get various functional fields used for the anomaly_reason and status_message definition
|
|
# skipping KPis: skipped_pct, skipped_pct_last_60m, skipped_pct_last_4h, skipped_pct_last_24h
|
|
try:
|
|
skipped_pct = float(record.get("skipped_pct", 0))
|
|
except Exception as e:
|
|
skipped_pct = 0
|
|
|
|
try:
|
|
skipped_pct_last_60m = float(record.get("skipped_pct_last_60m", 0))
|
|
except Exception as e:
|
|
skipped_pct_last_60m = 0
|
|
|
|
try:
|
|
skipped_pct_last_4h = float(record.get("skipped_pct_last_4h", 0))
|
|
except Exception as e:
|
|
skipped_pct_last_4h = 0
|
|
|
|
try:
|
|
skipped_pct_last_24h = float(record.get("skipped_pct_last_24h", 0))
|
|
except Exception as e:
|
|
skipped_pct_last_24h = 0
|
|
|
|
# similarly, load:
|
|
# count_errors, count_errors_last_60m, count_errors_last_4h, count_errors_last_24h
|
|
try:
|
|
count_errors = int(record.get("count_errors", 0))
|
|
except Exception as e:
|
|
count_errors = 0
|
|
|
|
try:
|
|
count_errors_last_60m = int(record.get("count_errors_last_60m", 0))
|
|
except Exception as e:
|
|
count_errors_last_60m = 0
|
|
|
|
try:
|
|
count_errors_last_4h = int(record.get("count_errors_last_4h", 0))
|
|
except Exception as e:
|
|
count_errors_last_4h = 0
|
|
|
|
try:
|
|
count_errors_last_24h = int(record.get("count_errors_last_24h", 0))
|
|
except Exception as e:
|
|
count_errors_last_24h = 0
|
|
|
|
# retrieve last_seen (epochtime) and cron_exec_sequence_sec (value in seconds)
|
|
try:
|
|
last_seen = int(record.get("last_seen", 0))
|
|
except Exception as e:
|
|
last_seen = 0
|
|
|
|
# get last_seen_datetime
|
|
if last_seen > 0:
|
|
last_seen_datetime = convert_epoch_to_datetime(last_seen)
|
|
else:
|
|
last_seen_datetime = "unknown"
|
|
|
|
try:
|
|
cron_exec_sequence_sec = int(record.get("cron_exec_sequence_sec", 0))
|
|
except Exception as e:
|
|
cron_exec_sequence_sec = 0
|
|
|
|
# calculate isDelayed (0 or 1)
|
|
# if (now() - last_seen) > (cron_exec_sequence_sec + 3600), isDelayed is 1
|
|
now = time.time()
|
|
if (now - last_seen) > (cron_exec_sequence_sec + 3600):
|
|
isDelayed = 1
|
|
else:
|
|
isDelayed = 0
|
|
|
|
# calculate the current delay in seconds
|
|
current_delay = now - last_seen
|
|
|
|
# get the current delay duration (human readable)
|
|
current_delay_duration = convert_seconds_to_duration(current_delay)
|
|
|
|
# retrieve orphan boolean (0 or 1) and load as an integer, as well as orphan_last_check (human readable date)
|
|
try:
|
|
orphan = int(record.get("orphan", 0))
|
|
except Exception as e:
|
|
orphan = 0
|
|
|
|
orphan_last_check = record.get("orphan_last_check", "unknown")
|
|
|
|
# if object_state is red but isUnderMonitoring is False, then object_state is orange
|
|
if object_state == "red":
|
|
if isUnderMonitoring is False:
|
|
object_state = "orange"
|
|
|
|
#
|
|
# Hybrid scoring: Apply score-based logic
|
|
# Outliers are handled separately via score_outliers in get_outliers_status
|
|
#
|
|
total_score = None
|
|
score_definition = {}
|
|
if score is not None:
|
|
# Calculate total score with static increments for anomalies
|
|
base_score = float(score) if score is not None else 0.0
|
|
total_score = base_score
|
|
|
|
# Build score definition to track where the score comes from
|
|
# Convert base_score to integer if it's a whole number, otherwise keep as float
|
|
if base_score == int(base_score):
|
|
score_definition["base_score"] = int(base_score)
|
|
else:
|
|
score_definition["base_score"] = base_score
|
|
score_definition["components"] = []
|
|
|
|
# Add outlier score if present
|
|
if score_outliers is not None and score_outliers > 0:
|
|
score_definition["score_outliers"] = float(score_outliers)
|
|
|
|
# Add score sources if available
|
|
score_source = record.get("score_source", [])
|
|
if score_source:
|
|
score_definition["score_source"] = score_source if isinstance(score_source, list) else [score_source]
|
|
|
|
# Note: Score increments for WLK anomalies are added later when anomalies are detected
|
|
# This ensures scoring happens in sync with anomaly_reason detection
|
|
|
|
# Convert total_score to integer if it's a whole number, otherwise keep as float
|
|
if total_score is not None:
|
|
if total_score == int(total_score):
|
|
score_definition["total_score"] = int(total_score)
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
|
|
# Apply score-based logic:
|
|
# - If total_score >= 100: entity should be red (if not already red due to other reasons, keep current state)
|
|
# - If total_score > 0 and < 100: entity should be orange (even if currently green)
|
|
# - If total_score == 0: keep current state
|
|
|
|
if total_score >= 100:
|
|
# If score >= 100, ensure entity is red
|
|
if object_state not in ["red", "blue"]:
|
|
object_state = "red"
|
|
logging.debug(
|
|
f'set_wlk_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", setting state to red (score >= 100)'
|
|
)
|
|
else:
|
|
logging.debug(
|
|
f'set_wlk_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", keeping {object_state} state (score >= 100)'
|
|
)
|
|
elif total_score > 0 and total_score < 100:
|
|
# If score > 0 and < 100, entity should be orange (even if currently green)
|
|
if object_state == "green":
|
|
object_state = "orange"
|
|
# Add status message about score
|
|
score_msg = f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
|
|
# Add outlier context if outliers are present
|
|
if score_outliers is not None and score_outliers > 0:
|
|
score_msg += f"Outlier anomalies detected with a score of {score_outliers:.1f}. "
|
|
score_msg += "This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
|
|
status_message.append(score_msg)
|
|
logging.debug(
|
|
f'set_wlk_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", setting green to orange (0 < score < 100)'
|
|
)
|
|
elif object_state == "red":
|
|
# Downgrade red to orange if score < 100
|
|
# Only apply score-based downgrade if the red state is NOT due to outliers
|
|
# (outliers with score_outliers >= 100 should still be red)
|
|
if isOutlier != 1:
|
|
object_state = "orange"
|
|
# Add status message about score when downgrading
|
|
score_msg = f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
|
|
if score_outliers is not None and score_outliers > 0:
|
|
score_msg += f"Outlier anomalies detected with a score of {score_outliers:.1f}. "
|
|
score_msg += "This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
|
|
status_message.append(score_msg)
|
|
logging.debug(
|
|
f'set_wlk_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", downgrading red to orange (non-outlier anomalies only)'
|
|
)
|
|
else:
|
|
# If outlier is present but score_outliers < 100, it was already set to isOutlier=2
|
|
# in get_outliers_status, so we can still apply score-based logic
|
|
if score_outliers is not None and score_outliers < 100:
|
|
object_state = "orange"
|
|
# Add status message about score when downgrading due to low outlier score
|
|
score_msg = f"Entity has an impact score of {total_score:.1f} (base score: {score:.1f}), which is above 0 but below 100. "
|
|
score_msg += f"Outlier anomalies detected with a score of {score_outliers:.1f}. "
|
|
score_msg += "This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
|
|
status_message.append(score_msg)
|
|
logging.debug(
|
|
f'set_wlk_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", score_outliers="{score_outliers}", '
|
|
f'downgrading red to orange (outlier score too low)'
|
|
)
|
|
else:
|
|
logging.debug(
|
|
f'set_wlk_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", keeping red state (outlier score >= 100)'
|
|
)
|
|
else:
|
|
logging.debug(
|
|
f'set_wlk_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", keeping {object_state} state (0 < score < 100)'
|
|
)
|
|
else:
|
|
# total_score == 0 or total_score <= 0
|
|
# Check if score is 0 due to false_positive (global false positive, not just outliers)
|
|
score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
|
|
has_false_positive = "false_positive" in score_source_list
|
|
|
|
if has_false_positive:
|
|
# Score is 0 due to false_positive, set to green (anomaly_reason will remain visible for audit)
|
|
object_state = "green"
|
|
status = 1
|
|
logging.debug(
|
|
f'set_wlk_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", score_source="{score_source}", '
|
|
f'setting state to green (false positive set, score cancelled)'
|
|
)
|
|
elif score_outliers is not None and score_outliers <= 0:
|
|
# Outliers are suppressed (false positive), set to green
|
|
object_state = "green"
|
|
status = 1
|
|
logging.debug(
|
|
f'set_wlk_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", score_outliers="{score_outliers}", '
|
|
f'setting state to green (outliers suppressed)'
|
|
)
|
|
else:
|
|
# Keep current state if legacy behavior
|
|
logging.debug(
|
|
f'set_wlk_status, hybrid scoring: object="{record.get("object")}", '
|
|
f'total_score="{total_score}", score_outliers="{score_outliers}", '
|
|
f'keeping current state (score == 0)'
|
|
)
|
|
|
|
# define anomaly_reason
|
|
if object_state == "green":
|
|
status_message_str = f"The entity status is complying with monitoring rules (status: {status}, status_description: {status_description})"
|
|
status_message.append(status_message_str)
|
|
|
|
# Check if false positive is set - if so, preserve anomaly reasons from score_definition
|
|
score_source = record.get("score_source", [])
|
|
score_source_list = score_source if isinstance(score_source, list) else ([score_source] if score_source else [])
|
|
has_false_positive = "false_positive" in score_source_list
|
|
|
|
if has_false_positive and score_definition and "components" in score_definition:
|
|
# Extract anomaly reasons from score_definition components
|
|
for component in score_definition.get("components", []):
|
|
component_type = component.get("type")
|
|
if component_type:
|
|
mapped_reason = get_anomaly_reason_from_component_type(component_type)
|
|
if mapped_reason and mapped_reason not in anomaly_reason:
|
|
anomaly_reason.append(mapped_reason)
|
|
# If no components found, still add "none"
|
|
if not anomaly_reason:
|
|
anomaly_reason.append("none")
|
|
else:
|
|
anomaly_reason.append("none")
|
|
|
|
else:
|
|
# Other statements
|
|
|
|
# ML Outlier: Check for outliers: either isOutlier == 1 (traditional) or score_outliers > 0 (hybrid scoring)
|
|
if isOutlier == 1 or (score_outliers is not None and score_outliers > 0):
|
|
# Always add outlier reasons when outliers are present (either traditional or hybrid scoring)
|
|
outlier_reasons = record.get("isOutlierReason", [])
|
|
if outlier_reasons:
|
|
if isinstance(outlier_reasons, list):
|
|
# Join the list elements into a single string
|
|
outlier_reasons_str = " | ".join(outlier_reasons)
|
|
status_message.append(outlier_reasons_str)
|
|
else:
|
|
# If it's not a list, append it directly
|
|
status_message.append(outlier_reasons)
|
|
# Add anomaly reason for outliers (either traditional or hybrid scoring)
|
|
if "ml_outliers_detection" not in anomaly_reason:
|
|
anomaly_reason.append("ml_outliers_detection")
|
|
|
|
# Add status message for orange state (score_outliers > 0 and < 100)
|
|
if score_outliers is not None and score_outliers > 0 and score_outliers < 100:
|
|
base_score = float(score) if score is not None else 0.0
|
|
status_message.append(
|
|
f"Entity has an impact score of {score_outliers:.1f} (base score: {base_score:.1f}), which is above 0 but below 100. "
|
|
f"This indicates potential anomalies that require attention but do not yet warrant a critical alert status."
|
|
)
|
|
|
|
# Skipping
|
|
if (
|
|
skipped_pct > 0
|
|
or skipped_pct_last_60m > 0
|
|
or skipped_pct_last_4h > 0
|
|
or skipped_pct_last_24h > 0
|
|
):
|
|
status_message.append(
|
|
f"skipping searches were detected, review and address performance issues for this search or finetune its scheduling plan to clear this alert. (skipped_pct_last_60m: {skipped_pct_last_60m}, skipped_pct_last_4h: {skipped_pct_last_4h}, skipped_pct_last_24h: {skipped_pct_last_24h})"
|
|
)
|
|
anomaly_reason.append("skipping_searches_detected")
|
|
# Add score increment for skipping searches if scoring is enabled (using VT-specific impact score)
|
|
if score is not None and total_score is not None:
|
|
increment = get_impact_score(vtenant_account, "impact_score_wlk_skipping_searches", 100)
|
|
total_score += increment
|
|
score_definition["components"].append({
|
|
"type": "skipping_searches_detected",
|
|
"score": increment,
|
|
"description": "Skipping searches detected"
|
|
})
|
|
# Convert total_score to integer if it's a whole number, otherwise keep as float
|
|
if total_score is not None:
|
|
if total_score == int(total_score):
|
|
score_definition["total_score"] = int(total_score)
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
|
|
# Errors count
|
|
if (
|
|
count_errors > 0
|
|
or count_errors_last_60m > 0
|
|
or count_errors_last_4h > 0
|
|
or count_errors_last_24h > 0
|
|
):
|
|
status_message.append(
|
|
f"execution errors were detected, review and address these errors to clear this alert. (count_errors_last_60m: {count_errors_last_60m}, count_errors_last_4h: {count_errors_last_4h}, count_errors_last_24h: {count_errors_last_24h})"
|
|
)
|
|
anomaly_reason.append("execution_errors_detected")
|
|
# Add score increment for execution errors if scoring is enabled (using VT-specific impact score)
|
|
if score is not None and total_score is not None:
|
|
increment = get_impact_score(vtenant_account, "impact_score_wlk_execution_errors", 100)
|
|
total_score += increment
|
|
score_definition["components"].append({
|
|
"type": "execution_errors_detected",
|
|
"score": increment,
|
|
"description": "Execution errors detected"
|
|
})
|
|
# Convert total_score to integer if it's a whole number, otherwise keep as float
|
|
if total_score is not None:
|
|
if total_score == int(total_score):
|
|
score_definition["total_score"] = int(total_score)
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
|
|
# orphan
|
|
if orphan == 1:
|
|
status_message.append(
|
|
f"the search was detected as an orphan search which means the user owning the search is not currently a valid user (orphan: {orphan}, time check: {orphan_last_check}"
|
|
)
|
|
anomaly_reason.append("orphan_search_detected")
|
|
# Add score increment for orphan search if scoring is enabled (using VT-specific impact score)
|
|
if score is not None and total_score is not None:
|
|
increment = get_impact_score(vtenant_account, "impact_score_wlk_orphan_search", 100)
|
|
total_score += increment
|
|
score_definition["components"].append({
|
|
"type": "orphan_search_detected",
|
|
"score": increment,
|
|
"description": "Orphan search detected"
|
|
})
|
|
# Convert total_score to integer if it's a whole number, otherwise keep as float
|
|
if total_score is not None:
|
|
if total_score == int(total_score):
|
|
score_definition["total_score"] = int(total_score)
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
|
|
# delayed
|
|
if isDelayed == 1 and cron_exec_sequence_sec > 0:
|
|
status_message.append(
|
|
f"the search was detected as delayed, this means the search is not running as expected (isDelayed: {isDelayed}, last_seen: {last_seen_datetime}, cron_exec_sequence_sec: {cron_exec_sequence_sec}, current delay: {current_delay_duration} duration)"
|
|
)
|
|
anomaly_reason.append("execution_delayed")
|
|
# Add score increment for execution delayed if scoring is enabled (using VT-specific impact score)
|
|
if score is not None and total_score is not None:
|
|
increment = get_impact_score(vtenant_account, "impact_score_wlk_execution_delayed", 100)
|
|
total_score += increment
|
|
score_definition["components"].append({
|
|
"type": "execution_delayed",
|
|
"score": increment,
|
|
"description": "Execution delayed"
|
|
})
|
|
# Convert total_score to integer if it's a whole number, otherwise keep as float
|
|
if total_score is not None:
|
|
if total_score == int(total_score):
|
|
score_definition["total_score"] = int(total_score)
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
|
|
# Monitoring time policy, add the message first then the anomaly reason
|
|
if isUnderMonitoring is False:
|
|
status_message.append(isUnderMonitoringMsg)
|
|
# Use new monitoring anomaly reason if provided, otherwise use legacy
|
|
if monitoring_anomaly_reason:
|
|
anomaly_reason.append(monitoring_anomaly_reason)
|
|
else:
|
|
anomaly_reason.append("out_of_monitoring_times")
|
|
# Add score increment for out of monitoring times if scoring is enabled
|
|
if score is not None and total_score is not None:
|
|
increment = get_impact_score(vtenant_account, "impact_score_wlk_out_of_monitoring_times", 100)
|
|
total_score += increment
|
|
score_definition["components"].append({
|
|
"type": "out_of_monitoring_times",
|
|
"score": increment,
|
|
"description": "Out of monitoring times"
|
|
})
|
|
# Convert total_score to integer if it's a whole number, otherwise keep as float
|
|
if total_score is not None:
|
|
if total_score == int(total_score):
|
|
score_definition["total_score"] = int(total_score)
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
|
|
# if we failed to identify the reason
|
|
if len(status_message) == 0:
|
|
status_message_str = f"The entity status is not complying with monitoring rules (status: {status}, status_description: {status_description})"
|
|
status_message.append(status_message_str)
|
|
anomaly_reason.append("status_not_met")
|
|
# Add score increment for status_not_met if scoring is enabled
|
|
if score is not None and total_score is not None:
|
|
increment = get_impact_score(vtenant_account, "impact_score_wlk_status_not_met", 100)
|
|
total_score += increment
|
|
score_definition["components"].append({
|
|
"type": "status_not_met",
|
|
"score": increment,
|
|
"description": "Status not met"
|
|
})
|
|
# Convert total_score to integer if it's a whole number, otherwise keep as float
|
|
if total_score is not None:
|
|
if total_score == int(total_score):
|
|
score_definition["total_score"] = int(total_score)
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
else:
|
|
score_definition["total_score"] = total_score
|
|
|
|
# form status_message_json
|
|
status_message_json["status_message"] = status_message
|
|
status_message_json["anomaly_reason"] = anomaly_reason
|
|
|
|
# Add score information to status_message_json for UI display (sorted alphabetically)
|
|
# Use total_score if calculated (hybrid scoring), otherwise use base score
|
|
if total_score is not None:
|
|
status_message_json["score"] = float(total_score)
|
|
# Update record score to reflect the calculated total_score for UI consistency
|
|
record["score"] = float(total_score)
|
|
# Add score definition for drilldown modal
|
|
if score_definition:
|
|
status_message_json["score_definition"] = score_definition
|
|
record["score_definition"] = json.dumps(score_definition) if isinstance(score_definition, dict) else score_definition
|
|
elif score is not None:
|
|
status_message_json["score"] = float(score)
|
|
if score_outliers is not None:
|
|
status_message_json["score_outliers"] = float(score_outliers)
|
|
if total_score is not None:
|
|
status_message_json["total_score"] = float(total_score)
|
|
|
|
# get disruption_duration
|
|
if not disruption_queue_record:
|
|
record["disruption_min_time_sec"] = 0
|
|
|
|
else:
|
|
|
|
logger.debug(
|
|
f'disruption_queue_record="{disruption_queue_record}", getting disruption_duration'
|
|
)
|
|
|
|
disruption_object_state = disruption_queue_record.get("object_state", "green")
|
|
try:
|
|
disruption_min_time_sec = int(
|
|
disruption_queue_record.get("disruption_min_time_sec", 0)
|
|
)
|
|
except:
|
|
disruption_min_time_sec = 0
|
|
# add to the record
|
|
record["disruption_min_time_sec"] = disruption_min_time_sec
|
|
|
|
try:
|
|
disruption_start_epoch = float(
|
|
disruption_queue_record.get("disruption_start_epoch", 0)
|
|
)
|
|
except:
|
|
disruption_start_epoch = 0
|
|
|
|
# Case 1: Entity is no longer in alert state (not red)
|
|
if object_state != "red":
|
|
# Only update if we were previously tracking a disruption
|
|
if disruption_object_state == "red":
|
|
disruption_queue_record["object_state"] = object_state
|
|
disruption_queue_record["disruption_start_epoch"] = 0
|
|
disruption_queue_record["mtime"] = time.time()
|
|
|
|
try:
|
|
disruption_queue_update(
|
|
disruption_queue_collection, disruption_queue_record
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"error updating disruption_queue_record: {e}")
|
|
return object_state, status_message, status_message_json, anomaly_reason
|
|
|
|
# Case 2: Entity is in alert state (red)
|
|
if object_state == "red":
|
|
current_time = time.time()
|
|
|
|
# If this is a new disruption, start tracking it
|
|
if disruption_object_state != "red":
|
|
disruption_queue_record["object_state"] = "red"
|
|
disruption_queue_record["disruption_start_epoch"] = current_time
|
|
disruption_queue_record["mtime"] = current_time
|
|
|
|
try:
|
|
disruption_queue_update(
|
|
disruption_queue_collection, disruption_queue_record
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"error updating disruption_queue_record: {e}")
|
|
|
|
# For new disruptions, if min time is set, show as blue with message
|
|
if disruption_min_time_sec > 0:
|
|
object_state = "blue"
|
|
status_message.append(
|
|
f"Minimal disruption time is configured for this entity, the current disruption duration is 0 which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
|
|
)
|
|
status_message_json["status_message"] = status_message
|
|
return object_state, status_message, status_message_json, anomaly_reason
|
|
|
|
# If we're already tracking a disruption, check duration
|
|
if disruption_min_time_sec > 0:
|
|
try:
|
|
disruption_duration = current_time - disruption_start_epoch
|
|
except Exception as e:
|
|
logger.error(f"error calculating disruption_duration: {e}")
|
|
disruption_duration = 0
|
|
|
|
# If duration hasn't breached threshold, show as blue with message
|
|
if disruption_duration < disruption_min_time_sec:
|
|
object_state = "blue"
|
|
status_message.append(
|
|
f"Minimal disruption time is configured for this entity, the current disruption duration is {convert_seconds_to_duration(disruption_duration)} which does not breach yet the minimal disruption time of {convert_seconds_to_duration(disruption_min_time_sec)}"
|
|
)
|
|
status_message_json["status_message"] = status_message
|
|
|
|
# anomaly_reason sanity check, if the list has more than 1 item, and contains "none", remove it
|
|
if isinstance(anomaly_reason, list):
|
|
if len(anomaly_reason) > 1 and "none" in anomaly_reason:
|
|
anomaly_reason.remove("none")
|
|
|
|
# return
|
|
logging.debug(
|
|
f'set_wlk_status, object="{record.get("object")}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
|
|
)
|
|
|
|
return object_state, status_message, status_message_json, anomaly_reason
|
|
|
|
|
|
def ack_check(object_value, ack_collection_keys, ack_collection_dict, record):
    """
    Enrich record with acknowledgment fields when object_value is known in the
    ack collection and the object categories match; otherwise apply the
    "not acknowledged" defaults.
    """

    defaults = {
        "ack_state": "inactive",
        "ack_type": "N/A",
        "ack_comment": "N/A",
        "ack_expiration": "N/A",
        "ack_mtime": "N/A",
    }

    ack_record = (
        ack_collection_dict.get(object_value)
        if object_value in ack_collection_keys
        else None
    )

    if ack_record and ack_record.get("object_category") == record.get(
        "object_category"
    ):
        # Copy each ack field from the matching ack record, falling back to defaults
        for field, fallback in defaults.items():
            record[field] = ack_record.get(field, fallback)
    else:
        # No matching ack entry: mark the entity as not acknowledged
        record.update(defaults)
|
|
|
|
|
|
def define_state_icon_code(record):
    """
    Determines the state_icon_code based on the object_state and ack_state
    contained within the record dictionary.

    Args:
        - record (dict): A dictionary containing 'object_state' and 'ack_state'.

    Returns:
        - str: The state_icon_code determined based on the provided conditions.
    """
    object_state = record.get("object_state")
    ack_state = record.get("ack_state")

    # Per-state icon codes, mirroring the Splunk macro conditions; the None
    # entry is the fallback for a missing/unknown ack_state
    codes_by_state = {
        "green": {"inactive": "001", "active": "002", None: "003"},
        "red": {"inactive": "004", "active": "005", None: "006"},
        "orange": {"inactive": "007", "active": "008", None: "009"},
        "blue": {"inactive": "010", "active": "011", None: "012"},
    }

    # Fallback code when the object_state itself is unknown
    default_code = "999"

    per_state = codes_by_state.get(object_state)
    if per_state is None:
        return default_code

    # Unknown ack_state values fall back to the state's None entry
    return per_state.get(ack_state, per_state.get(None, default_code))
|
|
|
|
|
|
def outliers_readiness(record):
    """
    Normalize outliers fields on the record: outliers_readiness is coerced to
    "True"/"False", OutliersIsOk is 1 when no outlier is flagged (else 0), and
    OutliersStatus is the matching text colour ("green"/"red").
    Non-integer isOutlier values are treated as 0.
    """
    # Normalize outliers_readiness: any value other than "True" becomes "False"
    record["outliers_readiness"] = (
        "True" if record.get("outliers_readiness") == "True" else "False"
    )

    # Coerce isOutlier to an int; fall back to 0 when conversion fails
    try:
        outlier_flag = int(record.get("isOutlier", 0))
    except ValueError:
        outlier_flag = 0

    # Derive OutliersIsOk and the text-based OutliersStatus together
    if outlier_flag == 0:
        record["OutliersIsOk"] = 1
        record["OutliersStatus"] = "green"
    else:
        record["OutliersIsOk"] = 0
        record["OutliersStatus"] = "red"
|
|
|
|
|
|
def sampling_anomaly_status(record):
    """
    Updates the record with SamplingIsOk and SamplingStatus based on the isAnomaly field.

    SamplingIsOk is 1 when no sampling anomaly is detected (isAnomaly == 0),
    otherwise 0. SamplingStatus is "green" when SamplingIsOk is 1, "red" otherwise.
    Missing or non-numeric isAnomaly values are treated as 0 (no anomaly).
    """

    # get isAnomaly, defaulting to 0 on missing or unparsable values
    try:
        isAnomaly = int(record.get("isAnomaly", 0))
    except Exception:
        isAnomaly = 0

    # define SamplingIsOk
    # FIX: was "1 if isAnomaly == 0 else 1", which always yielded 1 and made
    # the "red" status below unreachable; aligned with outliers_readiness.
    record["SamplingIsOk"] = 1 if isAnomaly == 0 else 0

    # define SamplingStatus (always use text-based status)
    record["SamplingStatus"] = "green" if record["SamplingIsOk"] == 1 else "red"
|
|
|
|
|
|
def logical_group_lookup(
    object_value,
    logicalgroup_members_collection_keys,
    logicalgroup_members_collection_dict,
    record,
):
    """
    Enrich record with logical group fields (object_group_key,
    object_group_name) when object_value is a known logical group member;
    otherwise strip these fields from the record if present.
    """

    group_fields = ("object_group_key", "object_group_name")

    if object_value in logicalgroup_members_collection_keys:
        member_record = logicalgroup_members_collection_dict.get(object_value)
        # Copy the group fields, defaulting to None when absent
        for field in group_fields:
            record[field] = member_record.get(field, None)
    else:
        # Not a member of any logical group: remove any stale group fields
        for field in group_fields:
            record.pop(field, None)
|
|
|
|
|
|
def set_feeds_lag_summary(record, component):
    """
    Generates a lag summary based on the data_last_lag_seen and data_last_ingestion_lag_seen fields

    :param record: entity record providing the lag fields (values coercible to
        float; unparsable or missing values default to 0)
    :param component: TrackMe component identifier; handled values are "dsm" and
        "dhm" (summary is "<event lag> / <ingestion lag>") and "mhm" (single
        last_lag_seen summary)
    :return: the lag summary string

    NOTE(review): for any component other than dsm/dhm/mhm, lag_summary is never
    assigned and the final return raises UnboundLocalError — confirm callers only
    ever pass the three supported components.
    """

    if component in ["dsm", "dhm"]:

        # Coerce both lag fields to rounded integers, defaulting to 0 on any
        # parsing error
        try:
            data_last_lag_seen = int(
                round(float(record.get("data_last_lag_seen", 0)), 0)
            )
        except Exception as e:
            data_last_lag_seen = 0
        try:
            data_last_ingestion_lag_seen = int(
                round(float(record.get("data_last_ingestion_lag_seen", 0)), 0)
            )
        except Exception as e:
            data_last_ingestion_lag_seen = 0

        # NOTE(review): this condition mixes both fields — a large negative
        # ingestion lag forces duration rendering of data_last_lag_seen;
        # presumably "data_last_lag_seen < -60" was intended, mirroring the
        # ingestion block below. Confirm before changing.
        if data_last_lag_seen > 60 or data_last_ingestion_lag_seen < -60:
            data_last_lag_seen_duration = (
                f"{convert_seconds_to_duration(data_last_lag_seen)}"
            )
        elif data_last_lag_seen == 0:
            data_last_lag_seen_duration = "0 sec"
        # NOTE(review): this branch covers 1..59 as well as negative values, so
        # a positive lag of 30 renders as "-30 sec" and a negative lag of -30 as
        # "--30 sec" — verify the intended sign handling.
        elif data_last_lag_seen < 60:
            data_last_lag_seen_duration = f"-{data_last_lag_seen} sec"
        else:
            # only reachable when data_last_lag_seen == 60 exactly
            data_last_lag_seen_duration = f"{data_last_lag_seen} sec"

        # Render the ingestion lag: human-readable duration when outside
        # [-60, 60], otherwise "<n> sec" (same sign caveats as above)
        if data_last_ingestion_lag_seen > 60 or data_last_ingestion_lag_seen < -60:
            data_last_ingestion_lag_seen_duration = (
                f"{convert_seconds_to_duration(data_last_ingestion_lag_seen)}"
            )
        elif data_last_ingestion_lag_seen == 0:
            data_last_ingestion_lag_seen_duration = "0 sec"
        elif data_last_ingestion_lag_seen < 60:
            data_last_ingestion_lag_seen_duration = (
                f"-{data_last_ingestion_lag_seen} sec"
            )
        else:
            data_last_ingestion_lag_seen_duration = (
                f"{data_last_ingestion_lag_seen} sec"
            )

        # return
        lag_summary = (
            f"{data_last_lag_seen_duration} / {data_last_ingestion_lag_seen_duration}"
        )

    elif component in ["mhm"]:

        # original logic: lag_summary= if(last_lag_seen>60, tostring(last_lag_seen, "duration"), last_lag_seen . " sec")

        try:
            last_lag_seen = int(round(float(record.get("last_lag_seen", 0)), 0))
        except Exception as e:
            last_lag_seen = 0
        if last_lag_seen > 60:
            lag_summary = f"{convert_seconds_to_duration(last_lag_seen)}"
        else:
            lag_summary = f"{last_lag_seen} sec"

    return lag_summary
|
|
|
|
|
|
def set_feeds_thresholds_duration(record):
    """
    Render the entity lag/delay thresholds as human-readable durations.

    :param record: entity record providing data_max_delay_allowed and
        data_max_lag_allowed (values coercible to float; invalid values
        default to 0)
    :return: tuple (data_max_delay_allowed_duration, data_max_lag_allowed_duration)
    """

    def _threshold_as_int(field):
        # Coerce the threshold to a rounded int, defaulting to 0 on any error
        try:
            return int(round(float(record.get(field, 0)), 0))
        except Exception:
            return 0

    data_max_delay_allowed_duration = convert_seconds_to_duration(
        _threshold_as_int("data_max_delay_allowed")
    )
    data_max_lag_allowed_duration = convert_seconds_to_duration(
        _threshold_as_int("data_max_lag_allowed")
    )

    return data_max_delay_allowed_duration, data_max_lag_allowed_duration
|
|
|
|
|
|
def set_cim_duration(record):
    """
    Return the last CIM tracker execution time as a human-readable duration.

    :param record: entity record providing tracker_last_duration (coercible to
        float; invalid values default to 0)
    :return: duration string
    """
    # Coerce to a rounded int, defaulting to 0 on any parsing error
    try:
        duration_sec = int(round(float(record.get("tracker_last_duration", 0)), 0))
    except Exception:
        duration_sec = 0

    return convert_seconds_to_duration(duration_sec)
|
|
|
|
|
|
def dsm_sampling_lookup(
    object_value, sampling_collection_keys, sampling_collection_dict, record
):
    """
    Enrich record with data sampling fields from the sampling collection.
    Unknown entities get the "pending" defaults; entities whose sampling
    feature was explicitly disabled get the "disabled" field set.
    """

    sampling_defaults = {
        "data_sample_feature": "enabled",
        "data_sample_status_message": {
            "state": "pending",
            "desc": "Data Sampling is pending and has not been performed yet for this entity",
        },
        "data_sample_status_colour": "N/A",
        "data_sample_anomaly_reason": "N/A",
    }

    if object_value not in sampling_collection_keys:
        # Unknown entity: sampling has not run yet, apply the pending defaults
        record.update(sampling_defaults)
        return

    sampling_record = sampling_collection_dict.get(object_value)
    feature_state = sampling_record.get("data_sample_feature", "enabled")

    if feature_state in ("enabled", "disabled_auto"):
        # Sampling is active (or auto-disabled): copy the sampling fields,
        # falling back to the pending defaults for missing values
        for field, fallback in sampling_defaults.items():
            record[field] = sampling_record.get(field, fallback)
    else:
        # Sampling explicitly disabled for this entity
        record.update(
            {
                "data_sample_feature": "disabled",
                "data_sample_status_message": {
                    "state": "disabled",
                    "desc": "Data sampling features are currently disabled for this entity.",
                },
                "data_sample_status_colour": "grey",
                "data_sample_anomaly_reason": "None",
            }
        )
|
|
|
|
|
|
def outliers_data_lookup(
    key_value,
    outliers_data_collection_keys,
    outliers_data_collection_dict,
    outliers_rules_collection_keys,
    outliers_rules_collection_dict,
    record,
):
    """
    Enrich record with ML outliers data (isOutlier, isOutlierReason,
    models_in_anomaly) and rules state (OutliersDisabled, from the collection
    field "is_disabled"), applying defaults when the entity is unknown.
    """

    #
    # Handle data
    #

    data_defaults = {
        "isOutlier": 0,
        "isOutlierReason": "",
        "models_in_anomaly": "",
    }

    if key_value in outliers_data_collection_keys:
        data_record = outliers_data_collection_dict.get(key_value)
        for field, fallback in data_defaults.items():
            record[field] = data_record.get(field, fallback)
    else:
        record.update(data_defaults)

    #
    # Handle rules
    #

    # The collection field "is_disabled" is surfaced on the record as
    # "OutliersDisabled", defaulting to 0 (not disabled)
    if key_value in outliers_rules_collection_keys:
        rules_record = outliers_rules_collection_dict.get(key_value)
        record["OutliersDisabled"] = rules_record.get("is_disabled", 0)
    else:
        record["OutliersDisabled"] = 0
|
|
|
|
|
|
def get_coll_docs_ref(collection, docs_collection_name):
    """
    Load all documents from the docs KVstore collection, paginating through it,
    and build lookup structures for the docs and for their member objects.

    :param collection: KVstore collection client exposing collection.data.query
    :param docs_collection_name: collection name (kept for interface purposes)
    :return: tuple (collection_records, collection_records_dict,
        collection_members_list, collection_members_dict)
    """

    collection_records = []
    collection_records_dict = {}
    collection_members_list = []
    collection_members_dict = {}

    # Paginate through the collection, page size of 5000 records
    skip_tracker = 0
    while True:
        page = collection.data.query(skip=skip_tracker)
        if not page:
            break

        for item in page:
            doc_key = item.get("_key")
            doc_note = item.get("doc_note")
            doc_link = item.get("doc_link")
            members = item.get("object", [])

            collection_records.append(item)
            collection_records_dict[doc_key] = {
                "doc_note": doc_note,
                "doc_link": doc_link,
                "object_members": members,
            }

            # Register each member once, pointing back at its owning doc
            for member in members:
                if member not in collection_members_list:
                    collection_members_list.append(member)
                    collection_members_dict[member] = {
                        "doc_key": doc_key,
                        "doc_note": doc_note,
                        "doc_link": doc_link,
                        "object_members": members,
                    }

        skip_tracker += 5000

    return (
        collection_records,
        collection_records_dict,
        collection_members_list,
        collection_members_dict,
    )
|
|
|
|
|
|
def docs_ref_lookup(
    docs_is_global,
    docs_note_default_global,
    docs_link_default_global,
    object_value,
    docs_members_collection_keys,
    docs_members_collection_dict,
    record,
):
    """
    Enrich record with documentation reference fields. When the entity has a
    dedicated doc entry, use its note/link and mark doc_is_global False;
    otherwise apply the global defaults.
    """

    if object_value in docs_members_collection_keys:
        doc_record = docs_members_collection_dict.get(object_value)

        # Entity-level doc overrides the global documentation reference
        record["doc_is_global"] = False
        record["doc_note"] = doc_record.get("doc_note", docs_note_default_global)
        record["doc_link"] = doc_record.get("doc_link", docs_link_default_global)
    else:
        # No entity-level doc: fall back to the global defaults
        record.update(
            {
                "doc_is_global": docs_is_global,
                "doc_note": docs_note_default_global,
                "doc_link": docs_link_default_global,
            }
        )
|
|
|
|
|
|
def wlk_disabled_apps_lookup(
    app_value,
    apps_enablement_collection_keys,
    apps_enablement_collection_dict,
    record,
):
    """
    Set app_is_enabled on the record from the apps enablement collection,
    defaulting to "True" when the app has no enablement entry.
    """

    if app_value in apps_enablement_collection_keys:
        enablement_record = apps_enablement_collection_dict.get(app_value)
        # The collection stores the flag under "enabled"
        record["app_is_enabled"] = enablement_record.get("enabled", "True")
    else:
        # No enablement entry: the app is considered enabled
        record["app_is_enabled"] = "True"
|
|
|
|
|
|
def wlk_versioning_lookup(
    key_value,
    versioning_collection_keys,
    versioning_collection_dict,
    record,
):
    """
    Enrich record with versioning information (cron_exec_sequence_sec,
    object_description, versioning_available) when the entity exists in the
    versioning collection; otherwise apply the defaults.
    """

    versioning_defaults = {
        "cron_exec_sequence_sec": 0,
        "object_description": "No description",
        "versioning_available": "False",
    }

    if key_value in versioning_collection_keys:
        lookup_record = versioning_collection_dict.get(key_value)

        record["versioning_available"] = "True"

        # The collection stores the description under "description"; fall back
        # to the default when it is missing or empty
        description = lookup_record.get("description", None)
        record["object_description"] = (
            description if description else versioning_defaults["object_description"]
        )

        record["cron_exec_sequence_sec"] = lookup_record.get(
            "cron_exec_sequence_sec", versioning_defaults["cron_exec_sequence_sec"]
        )

        logging.debug(
            f'versioning found for object="{record.get("object")}", object_key="{record.get("keyid")}", using key_value="{key_value}"'
        )

    else:
        record.update(versioning_defaults)
        logging.debug(
            f'no versioning found for object="{record.get("object")}", object_key="{record.get("keyid")}", using key_value="{key_value}"'
        )
|
|
|
|
|
|
def wlk_orphan_lookup(
    key_value,
    orphan_collection_keys,
    orphan_collection_dict,
    record,
):
    """
    Set the orphan flag on the record from the orphan collection, defaulting
    to 0 (not orphan) when the entity has no orphan entry.
    """

    if key_value in orphan_collection_keys:
        orphan_record = orphan_collection_dict.get(key_value)
        record["orphan"] = orphan_record.get("orphan", 0)
    else:
        record["orphan"] = 0
|
|
|
|
|
|
def apply_blocklist(record, blocklist_not_regex, blocklist_regex):
    """
    Determines whether a record should be appended based on blocklist rules.

    The record is temporarily enriched with list-form "index", "sourcetype"
    and "metric_category" fields so blocklist rules can match against them;
    these transformations are always undone before returning.

    :param record: The record to check.
    :param blocklist_not_regex: Dict of blocklist rules without regex.
    :param blocklist_regex: Dict of blocklist rules with regex.
    :return: True if the record should be appended, False otherwise.
    """

    def match_not_regex(field_value, rule):
        """Check if a field value matches a non-regex blocklist rule."""
        if isinstance(field_value, list):
            return any(item == rule.get("object") for item in field_value)
        else:
            return field_value == rule.get("object")

    def match_regex(field_value, rule):
        """Check if a field value matches a regex blocklist rule."""
        if isinstance(field_value, list):
            return any(re.match(rule.get("object"), item) for item in field_value)
        else:
            return re.match(rule.get("object"), field_value)

    def _is_blocked():
        """Return True when any blocklist rule (exact or regex) matches the record."""
        for _, rule in blocklist_not_regex.items():
            object_category = rule.get("object_category")
            if object_category in record and match_not_regex(
                record[object_category], rule
            ):
                return True
        for _, rule in blocklist_regex.items():
            object_category = rule.get("object_category")
            if object_category in record and match_regex(
                record[object_category], rule
            ):
                return True
        return False

    # define index and add to the record, using data_index if available and turn into a list from csv
    if "data_index" in record:
        record["index"] = record["data_index"].split(",")

    # same for data_sourcetype and sourcetype
    if "data_sourcetype" in record:
        record["sourcetype"] = record["data_sourcetype"].split(",")

    # metric_category is called equally in record and blocklist, but it can be a list too
    if "metric_category" in record:
        record["metric_category"] = record["metric_category"].split(",")

    try:
        # True (append) when no rule matched, False (blocked) otherwise
        return not _is_blocked()
    finally:
        # FIX: always undo the temporary transformations, including on the
        # blocked path (previously the "index"/"sourcetype" keys leaked and
        # metric_category stayed a list when a rule matched and the function
        # returned False early).
        record.pop("index", None)
        record.pop("sourcetype", None)
        if "metric_category" in record:
            record["metric_category"] = ",".join(record["metric_category"])
|
|
|
|
|
|
def dsm_check_default_thresholds(record, trackme_conf):
    """
    Verify that a DSM record contains the expected threshold fields; any field
    that is missing or explicitly set to None is set to its default value.

    Numeric thresholds (data_max_delay_allowed, data_max_lag_allowed) are
    additionally validated: they must be convertible to float, otherwise they
    fall back to the configured default.

    :param record: dict, the DSM entity record, updated in place
    :param trackme_conf: dict, TrackMe configuration providing the splk_general defaults
    :return: None (the record is mutated in place)
    """

    # Define a dictionary for the DSM fields and their respective default values
    fields_defaults = {
        "data_max_delay_allowed": trackme_conf["splk_general"][
            "splk_general_dsm_delay_default"
        ],
        "data_max_lag_allowed": trackme_conf["splk_general"][
            "splk_general_dsm_threshold_default"
        ],
        "data_override_lagging_class": "false",
        "allow_adaptive_delay": "true",
    }

    # Iterate through the fields to check their presence and value
    for field, default_value in fields_defaults.items():
        # Check if field is missing or explicitly set to None
        if field not in record or record[field] is None:
            record[field] = default_value
        elif field in ("data_max_delay_allowed", "data_max_lag_allowed"):
            # Numeric fields must be convertible to float; fall back to the
            # default otherwise. TypeError is caught too (bugfix) so that
            # non-numeric types such as list/dict do not crash the tracker.
            try:
                record[field] = float(record[field])
            except (TypeError, ValueError):
                record[field] = default_value
|
|
|
|
|
|
def dhm_check_default_thresholds(record, trackme_conf):
    """
    Verify that a DHM record contains the expected threshold fields; any field
    that is missing or explicitly set to None is set to its default value.

    Numeric thresholds (data_max_delay_allowed, data_max_lag_allowed) are
    additionally validated: they must be convertible to float, otherwise they
    fall back to the configured default.

    :param record: dict, the DHM entity record, updated in place
    :param trackme_conf: dict, TrackMe configuration providing the splk_general defaults
    :return: None (the record is mutated in place)
    """

    # Define a dictionary for the DHM fields and their respective default values
    fields_defaults = {
        "data_max_delay_allowed": trackme_conf["splk_general"][
            "splk_general_dhm_delay_default"
        ],
        "data_max_lag_allowed": trackme_conf["splk_general"][
            "splk_general_dhm_threshold_default"
        ],
        "data_override_lagging_class": "false",
        "allow_adaptive_delay": "true",
        "splk_dhm_alerting_policy": "global_policy",
    }

    # Iterate through the fields to check their presence and value
    for field, default_value in fields_defaults.items():
        # Check if field is missing or explicitly set to None
        if field not in record or record[field] is None:
            record[field] = default_value
        elif field in ("data_max_delay_allowed", "data_max_lag_allowed"):
            # Numeric fields must be convertible to float; fall back to the
            # default otherwise. TypeError is caught too (bugfix) so that
            # non-numeric types such as list/dict do not crash the tracker.
            try:
                record[field] = float(record[field])
            except (TypeError, ValueError):
                record[field] = default_value
|
|
|
|
|
|
def dynamic_priority_lookup(
    key_value, priority_collection_keys, priority_collection_dict, record
):
    """
    Updates record with dynamic priority information if key_value exists in priority_collection_keys.

    Precedence: priority policy (KVstore match) > external priority (priority_external)
    > entity managed default. When priority_updated is set to 1, the priority value
    itself is preserved (user managed), only the reason/policy metadata is refreshed.

    :param key_value: the key to look up in the priority policies collection
    :param priority_collection_keys: iterable of keys present in the priority policies collection
    :param priority_collection_dict: dict of priority policy records, keyed by key_value
    :param record: the entity record, updated in place
    """

    # get the value for priority_external and priority_reason
    priority_external = record.get("priority_external", None)
    priority_reason = record.get("priority_reason", "entity_managed")

    # check the value of priority_updated, if it does not exist in the record,
    # cannot be parsed, or is not a valid option (0 or 1), default to 0
    try:
        priority_updated = int(record["priority_updated"])
        if priority_updated not in [0, 1]:
            priority_updated = 0
    except Exception:
        priority_updated = 0

    # add to record as priority_updated
    record["priority_updated"] = priority_updated

    # priority policies always have precedence
    if key_value in priority_collection_keys:
        dynamic_priority_record = priority_collection_dict.get(key_value)
        dynamic_priority = dynamic_priority_record.get("priority", None)
        dynamic_priority_reason = dynamic_priority_record.get(
            "priority_reason", "entity_managed"
        )

        # if we have a match, and a priority, then update the record, otherwise do nothing
        if dynamic_priority:

            # add to record as priority_policy_value
            record["priority_policy_value"] = dynamic_priority

            if priority_updated != 1:
                record["priority"] = dynamic_priority
                logging.debug(
                    f'match found applying dynamic priority="{dynamic_priority}" for object="{record.get("object")}", key="{key_value}", priority_reason="{dynamic_priority_reason}"'
                )
            else:
                logging.debug(
                    f'priority_updated is set to 1, skipping dynamic priority="{dynamic_priority}" for object="{record.get("object")}", key="{key_value}", priority_reason="{dynamic_priority_reason}"'
                )
            record["priority_policy_id"] = dynamic_priority_reason
            record["priority_reason"] = f"priority policy id: {dynamic_priority_reason}"

        # the matching policy record carries no priority value
        else:
            # if priority_reason references a priority policy which no longer provides
            # a priority, reset to the default reason; otherwise keep the existing
            # value, it could be externally managed.
            # bugfix: the previous implementation unconditionally reset priority_reason
            # to "entity_managed" in both branches, contradicting the documented intent.
            if "priority_reason" in record:
                if "priority policy id" in record["priority_reason"]:
                    record["priority_reason"] = "entity_managed"

    elif priority_external:

        # attempt to get priority_reason, if not set, define to "externally_managed"
        priority_reason = record.get("priority_reason", "externally_managed")

        # if priority_external is in one of low, medium, high, critical, pending, then update
        if priority_external in ["low", "medium", "high", "critical", "pending"]:
            if priority_updated != 1:
                record["priority"] = priority_external
                logging.debug(
                    f'applying external priority="{priority_external}" for object="{record.get("object")}", priority_reason="{priority_reason}"'
                )
            else:
                logging.debug(
                    f'priority_updated is set to 1, skipping external priority="{priority_external}" for object="{record.get("object")}", priority_reason="{priority_reason}"'
                )
            record["priority_reason"] = f"{priority_reason}"

        else:
            # refuse out of scope external priority values, log a warning
            logging.warning(
                f'external priority="{priority_external}" for object="{record.get("object")}" is not in the list of allowed values, priority_reason="{priority_reason}"'
            )

    else:
        # no policy match and no external priority: set priority_reason to the default value
        record["priority_reason"] = "entity_managed"
        logging.debug(
            f'no match found for object="{record.get("object")}", key="{key_value}", priority_reason="{priority_reason}"'
        )
|
|
|
|
|
|
def dynamic_tags_lookup(key_value, tags_collection_keys, tags_collection_dict, record):
    """
    Apply dynamic (auto) tags to the record when key_value matches an entry
    in the tags policies collection; the record is updated in place.
    """

    # guard clause: no tags policy covers this entity key
    if key_value not in tags_collection_keys:
        return

    policy_entry = tags_collection_dict.get(key_value)
    auto_tags = policy_entry.get("tags_auto", None)

    # update the record only when the policy actually carries tags
    if auto_tags:
        record["tags_auto"] = auto_tags
        logging.debug(
            f'match found applying dynamic tags="{auto_tags}" for object="{record.get("object")}", key="{key_value}"'
        )
|
|
|
|
|
|
def dynamic_sla_class_lookup(
    key_value, sla_collection_keys, sla_collection_dict, record
):
    """
    Apply a dynamic SLA class to the record when key_value matches an entry
    in the SLA policies collection; the record is updated in place.
    """

    # guard clause: no SLA policy covers this entity key
    if key_value not in sla_collection_keys:
        return

    policy_entry = sla_collection_dict.get(key_value)
    resolved_sla_class = policy_entry.get("sla_class", None)

    # update the record only when the policy provides an SLA class
    if resolved_sla_class:
        record["sla_class"] = resolved_sla_class
        logging.debug(
            f'match found applying dynamic sla_class="{resolved_sla_class}" for object="{record.get("object")}", key="{key_value}"'
        )
|
|
|
|
|
|
def get_sla_timer(record, sla_classes, sla_default_class):
    """
    Calculate and render the SLA timer for an entity.

    Updates the record in place with: sla_class, sla_threshold,
    sla_threshold_duration, sla_timer, sla_timer_duration, sla_is_breached,
    sla_message and sla_message_json (a JSON summary for rendering purposes).

    :param record: dict, the entity record, updated in place
    :param sla_classes: dict of SLA class definitions; each entry is expected to
        carry an "sla_threshold" value in seconds (assumption based on usage below)
    :param sla_default_class: the SLA class to fall back to when the record has
        no sla_class or references an unknown one
    :return: None (the record is mutated in place)
    """

    # a JSON object containing a summary of SLA information for rendering purposes
    sla_message_json = {}

    # check sla_class (if not set in the record, use sla_default_class)
    sla_class = record.get("sla_class", None)
    if not sla_class:
        sla_class = sla_default_class
        record["sla_class"] = sla_default_class

    else:
        # if the referenced sla_class is not found in sla_classes, fall back to
        # the default class and update the record accordingly
        if sla_class not in sla_classes:
            sla_class = sla_default_class
            record["sla_class"] = sla_default_class

    # add to sla_message_json
    sla_message_json["sla_class"] = sla_class

    # get sla_threshold (seconds) from sla_classes, default to 24h if missing/invalid
    try:
        sla_threshold = int(sla_classes[sla_class]["sla_threshold"])
    except Exception as e:
        sla_threshold = 86400

    # convert to a human readable duration
    record["sla_threshold_duration"] = convert_seconds_to_duration(sla_threshold)

    # add to record
    record["sla_threshold"] = sla_threshold

    # add to sla_message_json
    sla_message_json["object"] = record.get("object")

    # Calculates
    # for SLA, we need to use the current object_state from the KVstore, rather
    # than the realtime calculated object_state
    object_state = record.get("kvcurrent_object_state", "red")

    # the realtime object_state is used to render a different SLA message when the
    # KVstore object_state is not yet updated (refresh pending)
    realtime_object_state = record.get("object_state", "red")

    sla_message_json["object_state"] = object_state

    if object_state == "red":

        # latest_flip_time marks when the entity turned red; treat missing or
        # unparsable values as 0 (timer not started)
        try:
            latest_flip_time = float(record.get("latest_flip_time", 0))
        except Exception as e:
            latest_flip_time = 0

        # elapsed seconds since the entity flipped to red
        if latest_flip_time > 0:
            sla_timer = int(round(float(int(time.time()) - latest_flip_time), 0))
        else:
            sla_timer = 0

        # add sla_timer
        record["sla_timer"] = sla_timer
        sla_message_json["sla_timer"] = sla_timer

        # calculate sla_timer_duration (human readable)
        sla_timer_duration = convert_seconds_to_duration(sla_timer)

        # add
        record["sla_timer_duration"] = convert_seconds_to_duration(sla_timer)
        sla_message_json["sla_timer_duration"] = sla_timer_duration

        # SLA is breached when the timer exceeds the class threshold
        sla_is_breached = 1 if sla_timer > sla_threshold else 0
        record["sla_is_breached"] = sla_is_breached
        sla_message_json["sla_is_breached"] = sla_is_breached

        # add an sla_message describing the breach status
        if sla_is_breached == 1:

            sla_message = f"SLA breached, the entity has been in a red state for more than {convert_seconds_to_duration(sla_timer)} (sla_class: {sla_class}, sla_class_threshold: {convert_seconds_to_duration(sla_threshold)}, sla_timer_sec: {int(round(sla_timer, 0))} sec, sla_threshold_sec: {sla_threshold} sec)"
            record["sla_message"] = sla_message
            sla_message_json["sla_message"] = sla_message

        else:
            sla_message = f"SLA is not breached, the entity has been in a red state for {convert_seconds_to_duration(sla_timer)} (sla_class: {sla_class}, sla_class_threshold: {convert_seconds_to_duration(sla_threshold)}, sla_timer_sec: {int(round(sla_timer, 0))} sec, sla_threshold_sec: {sla_threshold} sec)"
            record["sla_message"] = sla_message
            sla_message_json["sla_message"] = sla_message

    # KVstore says green but realtime says red: the KVstore status refresh is
    # pending, report a neutral SLA state with an explanatory message
    elif object_state == "green" and realtime_object_state == "red":
        record["sla_timer"] = 0
        sla_message_json["sla_timer"] = 0

        record["sla_timer_duration"] = "0 sec"
        sla_message_json["sla_timer_duration"] = "0 sec"

        record["sla_is_breached"] = 0
        sla_message_json["sla_is_breached"] = 0

        record["sla_message"] = (
            "SLA status refresh is pending, the realtime entity state is red and SLA will be reflected in the next minutes once the KVstore status is updated by trackers"
        )
        sla_message_json["sla_message"] = (
            "SLA status refresh is pending, the realtime entity state is red and SLA will be reflected in the next minutes once the KVstore status is updated by trackers"
        )

    # entity is not red: no SLA timer is running
    else:
        record["sla_timer"] = 0
        sla_message_json["sla_timer"] = 0

        record["sla_timer_duration"] = "0 sec"
        sla_message_json["sla_timer_duration"] = "0 sec"

        record["sla_is_breached"] = 0
        sla_message_json["sla_is_breached"] = 0

        record["sla_message"] = "SLA is not breached, the entity is not in a red state"
        sla_message_json["sla_message"] = (
            "SLA is not breached, the entity is not in a red state"
        )

    # add sla_message_json to record
    record["sla_message_json"] = sla_message_json
|
|
|
|
|
|
def flx_thresholds_lookup(object_value, key_value, record, thresholds_collection_dict):
    """
    Attach dynamic thresholds to the record: every entry of
    thresholds_collection_dict whose "object_id" equals key_value is copied
    into record["dynamic_thresholds"], keyed by its KVstore key.

    Example of a collection entry:

    {
        "c6745c4d9190e2f18bd83e4448a0584da54a832fa57dfd838b58940c8fced934": {
            "metric_name": "soar.mem_used_pct",
            "value": 80,
            "operator": ">",
            "condition_true": True,
            "mtime": 1747012850.5604594,
            "comment": "No comment for update.",
            "object_id": "199fc4f889ff4946181bb00f56aad44c7580dd87691de699e1c0d2fc851a1ec5",
            "_user": "nobody",
            "_key": "c6745c4d9190e2f18bd83e4448a0584da54a832fa57dfd838b58940c8fced934"
        }
    }

    Always returns True.
    """

    # retain only the threshold records targeting this entity (non-empty
    # object_id strictly equal to key_value)
    if thresholds_collection_dict:
        matching_thresholds = {
            threshold_key: threshold_record
            for threshold_key, threshold_record in thresholds_collection_dict.items()
            if threshold_record.get("object_id")
            and threshold_record.get("object_id") == key_value
        }
    else:
        matching_thresholds = {}

    # expose the matching thresholds on the record
    record["dynamic_thresholds"] = matching_thresholds

    return True
|
|
|
|
|
|
def flx_drilldown_searches_lookup(tenant_id, tracker_name, account, record, drilldown_searches_collection_dict):
    """
    Attach drilldown searches to an FLX entity record.

    All entries of drilldown_searches_collection_dict whose tracker_name matches
    one of the entity's (normalized) tracker names are collected; their search
    strings have $token$ / $result.token$ placeholders expanded from the record.
    The full list is stored in record["drilldown_searches"], and the first entry
    is also stored as flat drilldown_search / drilldown_search_earliest /
    drilldown_search_latest fields for backward compatibility.

    :param tenant_id: the tenant identifier, used to normalize tracker names
    :param tracker_name: tracker name(s); can be a simple string, a JSON array
        string, a comma-separated string (aggregated format), or a list
    :param account: unused here, kept for interface compatibility with callers
        (assumption: retained for signature stability — TODO confirm)
    :param record: the entity record, updated in place
    :param drilldown_searches_collection_dict: KVstore collection content keyed
        by record key; entries carry tracker_name, drilldown_search and
        optional earliest/latest
    :return: True when at least one drilldown search was attached, False otherwise
    """

    if drilldown_searches_collection_dict:

        # Helper function to expand $token$ placeholders in a drilldown_search string
        def expand_tokens(search_string, record):
            # non-string inputs are returned untouched
            if not isinstance(search_string, str):
                return search_string

            result = ""
            i = 0
            while i < len(search_string):
                if search_string[i] == "$":
                    # Look for closing $
                    end = search_string.find("$", i + 1)
                    if end != -1:
                        token = search_string[i + 1 : end]
                        # Handle $result.<token_name>$ format
                        if token.startswith("result."):
                            token_name = token[7:]  # Remove "result." prefix
                        else:
                            token_name = token
                        # Replace if token_name exists in record
                        if token_name in record:
                            replacement = str(record[token_name])
                            result += replacement
                        else:
                            # No replacement, keep token as is
                            result += "$" + token + "$"
                        i = end + 1
                    else:
                        # No closing $, just add the rest verbatim
                        result += search_string[i:]
                        break
                else:
                    result += search_string[i]
                    i += 1
            return result

        # Handle concurrent trackers: tracker_name can be a JSON array string,
        # comma-separated string, or a simple string
        tracker_names = []

        if tracker_name:
            if isinstance(tracker_name, str):
                try:
                    # Try to parse as JSON array (concurrent tracker format from KVstore)
                    parsed_tracker_names = json.loads(tracker_name)
                    if isinstance(parsed_tracker_names, list):
                        tracker_names = parsed_tracker_names
                    else:
                        # Single tracker name as string (valid JSON but not a list)
                        tracker_names = [tracker_name]
                except (json.JSONDecodeError, TypeError):
                    # Not a JSON array, check if it's a comma-separated string (aggregated format)
                    if "," in tracker_name:
                        # Comma-separated string, split and strip whitespace
                        tracker_names = [tn.strip() for tn in tracker_name.split(",") if tn.strip()]
                    else:
                        # Single tracker name as string
                        tracker_names = [tracker_name]
            elif isinstance(tracker_name, list):
                # Already a list
                tracker_names = tracker_name
            else:
                # Fallback: convert to string and treat as single tracker
                tracker_names = [str(tracker_name)]

        # Collect all drilldown searches from all matching trackers
        drilldown_searches_list = []

        for tn in tracker_names:
            # Normalize tracker name using the dedicated function
            normalized_tracker_name = normalize_flx_tracker_name(tenant_id, tn)

            # Look up all matching entries in the collection
            for key, value in drilldown_searches_collection_dict.items():
                if value.get("tracker_name") == normalized_tracker_name:
                    # get drilldown_search, drilldown_search_earliest, drilldown_search_latest
                    drilldown_search = value.get("drilldown_search")
                    drilldown_search_earliest = value.get("drilldown_search_earliest")
                    drilldown_search_latest = value.get("drilldown_search_latest")

                    if drilldown_search:
                        # expand tokens in drilldown_search if it's a string
                        expanded_search = expand_tokens(drilldown_search, record)

                        # Add to list with tracker name for reference; earliest/latest
                        # default to -24h/now when not provided
                        drilldown_searches_list.append({
                            "drilldown_search": expanded_search,
                            "drilldown_search_earliest": drilldown_search_earliest or "-24h",
                            "drilldown_search_latest": drilldown_search_latest or "now",
                            "tracker_name": normalized_tracker_name,  # Include tracker name for UI display
                        })

        # Store drilldown searches as array for UI to iterate over
        if drilldown_searches_list:
            record["drilldown_searches"] = drilldown_searches_list

            # For backward compatibility, also set the first drilldown search as single values
            # This ensures existing UI code that expects drilldown_search,
            # drilldown_search_earliest, drilldown_search_latest still works
            first_drilldown = drilldown_searches_list[0]
            record["drilldown_search"] = first_drilldown["drilldown_search"]
            record["drilldown_search_earliest"] = first_drilldown["drilldown_search_earliest"]
            record["drilldown_search_latest"] = first_drilldown["drilldown_search_latest"]

            return True

    return False
|
|
|
|
|
|
def flx_default_metrics_lookup(tenant_id, tracker_name, record, default_metrics_collection_dict):
    """
    Resolve the default metric(s) for an FLX entity from the default metrics collection.

    tracker_name may be a simple string, a JSON array string, a comma separated
    string (aggregated/concurrent tracker formats) or a list. When one or more
    metrics are found, record["default_metric"] is set (string when a single
    metric for backward compatibility, list when several for multi-select UIs)
    and True is returned. Otherwise record["default_metric"] falls back to
    "status" and False is returned.
    """

    if default_metrics_collection_dict:

        # normalize tracker_name into a list of tracker names, supporting the
        # concurrent (JSON array) and aggregated (comma-separated) formats
        tracker_name_list = []
        if tracker_name:
            if isinstance(tracker_name, str):
                try:
                    decoded = json.loads(tracker_name)
                    if isinstance(decoded, list):
                        tracker_name_list = decoded
                    else:
                        # valid JSON but not a list: single tracker name
                        tracker_name_list = [tracker_name]
                except (json.JSONDecodeError, TypeError):
                    if "," in tracker_name:
                        # aggregated format: split and strip whitespace
                        tracker_name_list = [
                            part.strip() for part in tracker_name.split(",") if part.strip()
                        ]
                    else:
                        tracker_name_list = [tracker_name]
            elif isinstance(tracker_name, list):
                tracker_name_list = tracker_name
            else:
                # fallback: coerce to string and treat as a single tracker
                tracker_name_list = [str(tracker_name)]

        # collect unique metric names across all matching trackers, preserving order
        collected_metrics = []
        already_seen = set()
        for candidate in tracker_name_list:
            normalized_name = normalize_flx_tracker_name(tenant_id, candidate)
            for entry in default_metrics_collection_dict.values():
                if entry.get("tracker_name") == normalized_name:
                    candidate_metric = entry.get("metric_name")
                    if candidate_metric and candidate_metric not in already_seen:
                        collected_metrics.append(candidate_metric)
                        already_seen.add(candidate_metric)

        if collected_metrics:
            # single metric: keep a string for backward compatibility,
            # multiple metrics: expose a list for multi-select UI support
            record["default_metric"] = (
                collected_metrics[0] if len(collected_metrics) == 1 else collected_metrics
            )
            return True

    # fallback: default metric is the status metric
    record["default_metric"] = "status"
    return False
|
|
|
|
|
|
def flx_check_dynamic_thresholds(logger, dynamic_thresholds, metrics_record):
    """
    Checks if the dynamic thresholds are breached and updates the record accordingly.

    Each threshold record carries: metric_name, operator (>, <, ==, !=, >=, <=),
    value (a number, or the name of another field in metrics_record to compare
    against), condition_true (the expected outcome of the comparison) and an
    optional score. A threshold is in alert when the actual comparison outcome
    differs from condition_true.

    :param logger: logger instance used for debug/warning/error reporting
    :param dynamic_thresholds: dict of threshold records keyed by KVstore key
    :param metrics_record: dict of metric values (or a JSON string of it)

    Returns:
    - threshold_alert: 1 if one or more thresholds are in alert, 0 otherwise
    - threshold_messages: list of messages indicating which thresholds are in alert
    - threshold_scores: list of scores for breached thresholds (defaults to 100 if not specified)
    """
    # map of supported comparison operators
    ops = {
        ">": operator.gt,
        "<": operator.lt,
        "==": operator.eq,
        "!=": operator.ne,
        ">=": operator.ge,
        "<=": operator.le,
    }

    # metrics_record may be provided as a JSON string, attempt to load it
    if not isinstance(metrics_record, dict):
        try:
            metrics_record = json.loads(metrics_record)
        except Exception as e:
            logger.error(
                f'metrics_record="{metrics_record}" value can not be converted to dict, exception="{e}"'
            )
            return 0, [], []

    logger.debug(
        f'starting function flx_check_dynamic_thresholds, dynamic_thresholds="{json.dumps(dynamic_thresholds, indent=2)}", metrics_record="{json.dumps(metrics_record, indent=2)}"'
    )

    threshold_alert = 0
    threshold_messages = []
    threshold_scores = []

    for threshold_key, threshold in dynamic_thresholds.items():

        logger.debug(
            f'checking threshold_key="{threshold_key}", threshold="{json.dumps(threshold, indent=2)}"'
        )

        metric_name = threshold.get("metric_name")
        op_str = threshold.get("operator")
        condition_true = strict_interpret_boolean(threshold.get("condition_true"))
        # Get threshold score, default to 100 if not present (for backward compatibility with existing records)
        try:
            threshold_score = int(threshold.get("score", 100))
        except (TypeError, ValueError):
            threshold_score = 100

        if metric_name not in metrics_record:
            logger.debug(
                f'function flx_check_dynamic_thresholds, metric_name="{metric_name}" not found in metrics_record="{json.dumps(metrics_record, indent=2)}", skipping threshold (metric may not be available for this tracker)'
            )
            continue

        if op_str not in ops:
            # bugfix: log message previously read "functionflx_..." (missing space)
            logger.error(
                f'function flx_check_dynamic_thresholds, op_str="{op_str}" not found in ops'
            )
            continue

        # threshold value can be a string referencing a field in the metrics_record, or a proper numerical value
        threshold_num_parsed = False

        # first, try to load the threshold value as a float
        try:
            threshold_value = float(threshold.get("value"))
            threshold_num_parsed = True
        except (TypeError, ValueError):
            pass

        # if failed, try to load the threshold value from the field value referenced in the threshold value
        if not threshold_num_parsed and threshold.get("value") in metrics_record:
            try:
                threshold_value = float(metrics_record.get(threshold.get("value")))
                threshold_num_parsed = True
            except (TypeError, ValueError):
                pass

        # if both failed, log a warning message and skip the threshold
        if not threshold_num_parsed:
            logger.warning(
                f'function flx_check_dynamic_thresholds threshold_value value can not be converted to float, skipping threshold_record="{json.dumps(threshold, indent=2)}"'
            )
            continue

        try:
            metric_value = float(metrics_record.get(metric_name, 0))
        except (TypeError, ValueError):
            logger.error(
                f'function flx_check_dynamic_thresholds metric_value value can not be converted to float, skipping threshold_key="{threshold_key}"'
            )
            continue  # Skip if value can't be converted to float

        op_func = ops[op_str]
        match = op_func(metric_value, threshold_value)

        # alert if the expected condition is NOT matched
        should_alert = (condition_true and not match) or (not condition_true and match)

        if should_alert:
            threshold_alert = 1
            threshold_messages.append(
                f"Threshold condition is in alert: "
                f"metric='{metric_name}', value={metric_value}, "
                f"threshold={threshold_value}, operator='{op_str}', "
                f"condition_true={condition_true}"
            )
            threshold_scores.append(threshold_score)

        logger.debug(
            f"function flx_check_dynamic_thresholds, Checking threshold '{threshold_key}' on metric '{metric_name}': value={metric_value}, threshold={threshold_value}, operator='{op_str}', condition_true={condition_true}, match={match}, should_alert={should_alert}"
        )

    return threshold_alert, threshold_messages, threshold_scores
|
|
|
|
|
|
def fqm_thresholds_lookup(object_value, key_value, record, thresholds_collection_dict):
    """
    Collect the dynamic thresholds applying to this entity and store them in
    record["dynamic_thresholds"].

    Every entry of thresholds_collection_dict whose "object_id" equals
    key_value is retained, keyed by its KVstore key. Example of an entry:

    {
        "c6745c4d9190e2f18bd83e4448a0584da54a832fa57dfd838b58940c8fced934": {
            "metric_name": "soar.mem_used_pct",
            "value": 80,
            "operator": ">",
            "condition_true": True,
            "mtime": 1747012850.5604594,
            "comment": "No comment for update.",
            "object_id": "199fc4f889ff4946181bb00f56aad44c7580dd87691de699e1c0d2fc851a1ec5",
            "_user": "nobody",
            "_key": "c6745c4d9190e2f18bd83e4448a0584da54a832fa57dfd838b58940c8fced934"
        }
    }

    Always returns True.
    """

    applicable_thresholds = {}

    if thresholds_collection_dict:
        for kv_key, kv_record in thresholds_collection_dict.items():
            # a threshold applies when its (non-empty) object_id targets this entity
            candidate_id = kv_record.get("object_id", None)
            if candidate_id and candidate_id == key_value:
                applicable_thresholds[kv_key] = kv_record

    # expose the applicable thresholds on the record
    record["dynamic_thresholds"] = applicable_thresholds

    return True
|
|
|
|
|
|
def fqm_check_dynamic_thresholds(logger, dynamic_thresholds, metrics_record):
    """
    Checks if the dynamic thresholds are breached and updates the record accordingly.

    Each threshold record carries: metric_name, operator (>, <, ==, !=, >=, <=),
    a numeric value, condition_true (the expected outcome of the comparison) and
    an optional score. A threshold is in alert when the actual comparison
    outcome differs from condition_true.

    :param logger: logger instance used for debug/warning/error reporting
    :param dynamic_thresholds: dict of threshold records keyed by KVstore key
    :param metrics_record: dict of metric values (or a JSON string of it)

    Returns:
    - threshold_alert: 1 if one or more thresholds are in alert, 0 otherwise
    - threshold_messages: list of messages indicating which thresholds are in alert
    - threshold_scores: list of scores for breached thresholds (defaults to 100 if not specified)
    """
    # map of supported comparison operators
    ops = {
        ">": operator.gt,
        "<": operator.lt,
        "==": operator.eq,
        "!=": operator.ne,
        ">=": operator.ge,
        "<=": operator.le,
    }

    # metrics_record may be provided as a JSON string, attempt to load it
    if not isinstance(metrics_record, dict):
        try:
            metrics_record = json.loads(metrics_record)
        except Exception as e:
            logger.error(
                f'metrics_record="{metrics_record}" value can not be converted to dict, exception="{e}"'
            )
            return 0, [], []

    logger.debug(
        f'starting function fqm_check_dynamic_thresholds, dynamic_thresholds="{json.dumps(dynamic_thresholds, indent=2)}", metrics_record="{json.dumps(metrics_record, indent=2)}"'
    )

    threshold_alert = 0
    threshold_messages = []
    threshold_scores = []

    for threshold_key, threshold in dynamic_thresholds.items():

        logger.debug(
            f'checking threshold_key="{threshold_key}", threshold="{json.dumps(threshold, indent=2)}"'
        )

        metric_name = threshold.get("metric_name")
        op_str = threshold.get("operator")
        condition_true = strict_interpret_boolean(threshold.get("condition_true"))
        # Get threshold score, default to 100 if not present (for backward compatibility with existing records)
        try:
            threshold_score = int(threshold.get("score", 100))
        except (TypeError, ValueError):
            threshold_score = 100

        if metric_name not in metrics_record:
            logger.debug(
                f'function fqm_check_dynamic_thresholds, metric_name="{metric_name}" not found in metrics_record="{json.dumps(metrics_record, indent=2)}", skipping threshold (metric may not be available for this tracker)'
            )
            continue

        if op_str not in ops:
            # bugfix: log message previously read "functionfqm_..." (missing space)
            logger.error(
                f'function fqm_check_dynamic_thresholds, op_str="{op_str}" not found in ops'
            )
            continue

        try:
            threshold_value = float(threshold.get("value"))
        except (TypeError, ValueError):
            logger.error(
                f'function fqm_check_dynamic_thresholds threshold_value value can not be converted to float, skipping threshold_key="{threshold_key}"'
            )
            continue

        try:
            metric_value = float(metrics_record.get(metric_name, 0))
        except (TypeError, ValueError):
            logger.error(
                f'function fqm_check_dynamic_thresholds metric_value value can not be converted to float, skipping threshold_key="{threshold_key}"'
            )
            continue  # Skip if value can't be converted to float

        op_func = ops[op_str]
        match = op_func(metric_value, threshold_value)

        # alert if the expected condition is NOT matched
        should_alert = (condition_true and not match) or (not condition_true and match)

        if should_alert:
            threshold_alert = 1
            threshold_messages.append(
                f"Threshold condition is in alert: "
                f"metric='{metric_name}', value={metric_value}, "
                f"threshold={threshold_value}, operator='{op_str}', "
                f"condition_true={condition_true}"
            )
            threshold_scores.append(threshold_score)

        logger.debug(
            f"function fqm_check_dynamic_thresholds, Checking threshold '{threshold_key}' on metric '{metric_name}': value={metric_value}, threshold={threshold_value}, operator='{op_str}', condition_true={condition_true}, match={match}, should_alert={should_alert}"
        )

    return threshold_alert, threshold_messages, threshold_scores
|
|
|
|
|
|
def calculate_score(service, tenant_id, component):
    """
    Calculates the score for each object_id based on outlier scoring metrics from the past 24 hours.

    :param service: The Splunk service object used to execute the search.
    :param tenant_id: The tenant ID to query scores for.
    :param component: The component (object_category) to query scores for.
    :return: A dictionary keyed by object_id, where each value contains:
        - score: The sum of scores for the object_id (float)
        - score_outliers: The sum of scores for the object_id that are outliers (float)
        - object: The object name (string)
        - score_source: A list of scoring sources (list of strings)
        An empty dict is returned when mandatory inputs are missing or the search fails.
    """

    def _coerce_float(value):
        # Best-effort float coercion: Splunk may return missing or
        # non-numeric fields, which we normalize to 0.0
        try:
            return float(value)
        except (TypeError, ValueError):
            return 0.0

    # Validate mandatory inputs up-front; component is interpolated into the
    # search query just like tenant_id, so it gets the same guard
    if not service:
        logging.error('function calculate_score, service parameter is None or empty')
        return {}

    if not tenant_id:
        logging.error('function calculate_score, tenant_id parameter is None or empty')
        return {}

    if not component:
        logging.error('function calculate_score, component parameter is None or empty')
        return {}

    # Build the search query: sum scores per object, flagging outlier score sources
    search_query = remove_leading_spaces(
        f"""
        | mstats sum(trackme.scoring.score) as score where `trackme_metrics_idx({tenant_id})` tenant_id="{tenant_id}" object_category="{component}" by object_id, object, score_source
        | eval score_outliers=if(match(score_source,"^(false_positive_outlier$|lowerbound_outlier|upperbound_outlier)"),score,null())
        | stats sum(score) as score, sum(score_outliers) as score_outliers, values(score_source) as score_source by object_id, object
        """
    )

    # Search parameters for past 24 hours
    kwargs_search = {
        "earliest_time": "-24h",
        "latest_time": "now",
        "preview": "false",
        "output_mode": "json",
        "count": 0,
    }

    # Result dictionary keyed by object_id
    scores_dict = {}

    start_time = time.time()

    try:

        logging.debug(
            f'function calculate_score, tenant_id="{tenant_id}", component="{component}", search_query="{search_query}", kwargs_search="{json.dumps(kwargs_search, indent=2)}"'
        )

        # Execute the search (retry/backoff handled by run_splunk_search)
        reader = run_splunk_search(
            service,
            search_query,
            kwargs_search,
            24,  # max_retries
            5,  # sleep_time
        )

        # Process results: one record per (object_id, object); guard clauses
        # skip non-dict records and records without an object_id
        for item in reader:
            if not isinstance(item, dict):
                continue

            object_id = item.get("object_id")
            if not object_id:
                continue

            # score_source may come back as a single string (one value) or a
            # list (multiple values); normalize to a list in all cases
            score_source_raw = item.get("score_source")
            if isinstance(score_source_raw, list):
                score_source_list = score_source_raw
            elif isinstance(score_source_raw, str):
                score_source_list = [score_source_raw]
            else:
                score_source_list = []

            scores_dict[object_id] = {
                "score": _coerce_float(item.get("score", 0)),
                "score_outliers": _coerce_float(item.get("score_outliers", 0)),
                "object": item.get("object", ""),
                "score_source": score_source_list,
            }

        runtime = round(time.time() - start_time, 3)
        logging.debug(
            f'function calculate_score, tenant_id="{tenant_id}", '
            f'no_objects="{len(scores_dict)}", run_time="{runtime}"'
        )

    except Exception as e:
        logging.error(
            f'function calculate_score, tenant_id="{tenant_id}", '
            f'failed with exception="{str(e)}"'
        )
        # Return empty dict on error
        return {}

    return scores_dict