# NOTE(review): the lines below were repository-web-view artifacts (page chrome,
# file path and size) accidentally captured into the file; kept as comments so
# the file parses as Python.
# Source path: Splunk_Deploiement/apps/trackme/bin/trackmedecisionmaker.py
# (2510 lines, 128 KiB in the original repository view)
#!/usr/bin/env python
# coding=utf-8
__author__ = "TrackMe Limited"
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
__credits__ = "TrackMe Limited, U.K."
__license__ = "TrackMe Limited, all rights reserved"
__version__ = "0.1.0"
__maintainer__ = "TrackMe Limited, U.K."
__email__ = "support@trackme-solutions.com"
__status__ = "PRODUCTION"
# Standard library
import os
import sys
import json
import time
# Logging
import logging
from logging.handlers import RotatingFileHandler
# Networking
import urllib3
import requests
# Suppress TLS warnings: calls to splunkd are made with verify=False against
# the local self-signed certificate
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# splunk home, required to locate the log directory
splunkhome = os.environ["SPLUNK_HOME"]
# set logging: dedicated rotating log file (10 MB max, 1 backup) under the
# standard Splunk log directory so it is indexed by _internal
filehandler = RotatingFileHandler(
    "%s/var/log/splunk/trackme_decision_maker.log" % splunkhome,
    mode="a",
    maxBytes=10000000,
    backupCount=1,
)
formatter = logging.Formatter(
    "%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s"
)
# emit log timestamps in UTC rather than local time
logging.Formatter.converter = time.gmtime
filehandler.setFormatter(formatter)
log = logging.getLogger()  # root logger - Good to get it only once.
for hdlr in log.handlers[:]:  # remove the existing file handlers
    if isinstance(hdlr, logging.FileHandler):
        log.removeHandler(hdlr)
log.addHandler(filehandler)  # set the new handler
# set the log level to INFO, DEBUG as the default is ERROR
# (the effective level is re-set per run from the TrackMe configuration in stream())
log.setLevel(logging.INFO)
# import libs
import import_declare_test
# import Splunk
from splunklib.searchcommands import (
dispatch,
StreamingCommand,
Configuration,
Option,
validators,
)
# Import trackme libs
from trackme_libs import trackme_reqinfo, trackme_vtenant_account
# import TrackMe get data libs
from trackme_libs_get_data import (
get_feeds_datagen_kv_collection,
get_sampling_kv_collection,
search_kv_collection_restmode,
search_kv_collection_searchmode,
search_kv_collection_sdkmode,
)
# Import trackme decisionmaker libs
from trackme_libs_decisionmaker import (
convert_epoch_to_datetime,
get_monitoring_time_status,
get_outliers_status,
get_data_sampling_status,
get_future_status,
get_future_metrics_status,
get_is_under_dcount_host,
get_logical_groups_collection_records,
get_dsm_latency_status,
get_dsm_delay_status,
set_dsm_status,
set_dhm_status,
set_mhm_status,
set_flx_status,
set_fqm_status,
set_wlk_status,
apply_blocklist,
dynamic_priority_lookup,
dynamic_tags_lookup,
dynamic_sla_class_lookup,
get_sla_timer,
dsm_sampling_lookup,
sampling_anomaly_status,
flx_thresholds_lookup,
fqm_thresholds_lookup,
flx_check_dynamic_thresholds,
fqm_check_dynamic_thresholds,
flx_drilldown_searches_lookup,
flx_default_metrics_lookup,
calculate_score,
)
# import trackme libs disruption queue
from trackme_libs_disruption_queue import (
disruption_queue_lookup,
disruption_queue_update,
disruption_queue_get_duration,
)
# Import TrackMe splk-flx libs
from trackme_libs_splk_flx import trackme_flx_gen_metrics
# Import TrackMe splk-fqm libs
from trackme_libs_splk_fqm import trackme_fqm_gen_metrics
# Import TrackMe utils libs
from trackme_libs_utils import get_uuid
@Configuration(distributed=False)
class TrackMeDecisionMaker(StreamingCommand):
# Command options supplied on the SPL command line.
# tenant_id: the TrackMe virtual tenant this command operates on (mandatory)
tenant_id = Option(
    doc="""
**Syntax:** **tenant_id=****
**Description:** The tenant identifier.""",
    require=True,
    default=None,
)
# component: the TrackMe component the incoming records belong to (mandatory),
# restricted by regex to the known component short names
component = Option(
    doc="""
**Syntax:** **component=****
**Description:** Specify the TrackMe component.""",
    require=True,
    default=None,
    validate=validators.Match("component", r"^(dsm|dhm|mhm|wlk|flx|fqm)$"),
)
"""
This function ensures that records have the same list of fields to allow Splunk to automatically extract these fields
If a given result does not have a given field, it will be added to the record as an empty value
"""
def generate_fields(self, records):
all_keys = set()
for record in records:
all_keys.update(record.keys())
for record in records:
for key in all_keys:
if key not in record:
record[key] = ""
yield record
def get_tenant_metric_idx(self):
    """
    Retrieve the metric index configured for this tenant by calling the
    TrackMe vtenants REST endpoint on splunkd.

    :return: the tenant metric index name (value of trackme_metric_idx)
    :raises Exception: if the REST call fails or returns a non-success status;
        the error is logged once before being raised.
    """
    # Define an header for requests authenticated communications with splunkd
    header = {
        "Authorization": "Splunk %s" % self._metadata.searchinfo.session_key,
        "Content-Type": "application/json",
    }
    # get the index conf for this tenant
    url = "%s/services/trackme/v2/vtenants/tenant_idx_settings" % (
        self._metadata.searchinfo.splunkd_uri
    )
    data = {"tenant_id": self.tenant_id, "idx_stanza": "trackme_metric_idx"}
    # Retrieve the tenant idx; any failure is logged once and re-raised
    try:
        response = requests.post(
            url,
            headers=header,
            data=json.dumps(data, indent=1),
            # splunkd uses a self-signed certificate by default
            verify=False,
            timeout=600,
        )
        if response.status_code not in (200, 201, 204):
            # raise here so the single except clause below logs and wraps it,
            # avoiding the previous double-logging of the same failure
            raise Exception(
                f'response.status_code="{response.status_code}", response.text="{response.text}"'
            )
        # requests decodes the JSON body natively; the previous
        # json.loads(json.dumps(...)) round-trip was a no-op
        tenant_trackme_metric_idx = response.json()["trackme_metric_idx"]
    except Exception as e:
        error_msg = f'instance_id={self.instance_id}, failed to retrieve the tenant metric index, exception="{str(e)}"'
        logging.error(error_msg)
        raise Exception(error_msg)
    return tenant_trackme_metric_idx
"""
Stream function
"""
def stream(self, records):
# Start performance counter
start = time.time()
# Get request info and set logging level
reqinfo = trackme_reqinfo(
self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri
)
log.setLevel(reqinfo["logging_level"])
# set instance_id
self.instance_id = get_uuid()
# Get virtual tenant account
vtenant_conf = trackme_vtenant_account(
self._metadata.searchinfo.session_key,
self._metadata.searchinfo.splunkd_uri,
self.tenant_id,
)
# get metric index
metric_index = self.get_tenant_metric_idx()
#
# System level settings
#
system_future_tolerance = float(
reqinfo["trackme_conf"]["splk_general"][
"splk_general_feeds_future_tolerance"
]
)
#
# System level default minimal disruption period
#
default_disruption_min_time_sec = int(
vtenant_conf["default_disruption_min_time_sec"]
)
#
# Tenant level default monitoring time policy
#
try:
default_monitoring_time_policy = vtenant_conf["monitoring_time_policy"]
except Exception as e:
default_monitoring_time_policy = "all_time"
# set task
#
task_start = time.time()
task_instance_id = get_uuid()
task_name = "get_priority_collection_records"
# dynamic priority, for all components
# get priority collection
priority_collection_name = (
f"kv_trackme_{self.component}_priority_tenant_{self.tenant_id}"
)
priority_collection = self.service.kvstore[priority_collection_name]
(
priority_records,
priority_collection_keys,
priority_collection_dict,
last_page,
) = search_kv_collection_sdkmode(
logging, self.service, priority_collection_name, page=1, page_count=0, orderby="keyid"
)
# end task
#
task_end = time.time()
task_run_time = round((task_end - task_start), 3)
logging.info(
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
)
# set task
#
task_start = time.time()
task_instance_id = get_uuid()
task_name = "get_tags_collection_records"
# get tags collection
tags_collection_name = (
f"kv_trackme_{self.component}_tags_tenant_{self.tenant_id}"
)
tags_collection = self.service.kvstore[tags_collection_name]
(
tags_records,
tags_collection_keys,
tags_collection_dict,
last_page,
) = search_kv_collection_sdkmode(
logging, self.service, tags_collection_name, page=1, page_count=0, orderby="keyid"
)
# end task
#
task_end = time.time()
task_run_time = round((task_end - task_start), 3)
logging.info(
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
)
# set task
#
task_start = time.time()
task_instance_id = get_uuid()
task_name = "get_sla_collection_records"
# dynamic sla_class, for all components
# get sla collection
sla_collection_name = f"kv_trackme_{self.component}_sla_tenant_{self.tenant_id}"
sla_collection = self.service.kvstore[sla_collection_name]
(
sla_records,
sla_collection_keys,
sla_collection_dict,
last_page,
) = search_kv_collection_sdkmode(
logging, self.service, sla_collection_name, page=1, page_count=0, orderby="keyid"
)
# end task
#
task_end = time.time()
task_run_time = round((task_end - task_start), 3)
logging.info(
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
)
# set task
#
task_start = time.time()
task_instance_id = get_uuid()
task_name = "get_disruption_queue_collection_records"
# get disruption queue collection
disruption_queue_collection_name = (
f"kv_trackme_common_disruption_queue_tenant_{self.tenant_id}"
)
disruption_queue_collection = self.service.kvstore[
disruption_queue_collection_name
]
(
disruption_queue_records,
disruption_queue_collection_keys,
disruption_queue_collection_dict,
last_page,
) = search_kv_collection_sdkmode(
logging, self.service, disruption_queue_collection_name, page=1, page_count=0, orderby="keyid"
)
logging.debug(
f'instance_id={self.instance_id}, disruption_queue_collection_dict="{json.dumps(disruption_queue_collection_dict, indent=2)}"'
)
# end task
#
task_end = time.time()
task_run_time = round((task_end - task_start), 3)
logging.info(
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
)
#
# SLA timer
#
sla_classes = {}
sla_default_class = None
sla_classes = reqinfo["trackme_conf"]["sla"]["sla_classes"]
# try loading the JSON
try:
sla_classes = json.loads(sla_classes)
sla_default_class = reqinfo["trackme_conf"]["sla"]["sla_default_class"]
if not len(sla_default_class) > 0 or sla_default_class not in sla_classes:
sla_default_class = "silver"
logging.error(
f'instance_id={self.instance_id}, Invalid sla_default_class="{sla_default_class}", this SLA class is not part of the SLA classes, applying fallback configuration'
)
except:
logging.error(
f'instance_id={self.instance_id}, Error loading sla_classes JSON, please check the configuration, the JSON is not valid JSON, applying fallback configuration, exception="{str(e)}"'
)
sla_classes = json.loads(
'{"gold": {"sla_threshold": 14400, "rank": 3}, "silver": {"sla_threshold": 86400, "rank": 2}, "platinum": {"sla_threshold": 172800, "rank": 1}}'
)
sla_default_class = "silver"
# retrieve the score for the tenant and component
scores_dict = calculate_score(self.service, self.tenant_id, self.component)
logging.info(
f'instance_id={self.instance_id}, scores_dict="{json.dumps(scores_dict, indent=2)}"'
)
#
# splk-dsm specific collections
#
if self.component == "dsm":
# set task
#
task_start = time.time()
task_instance_id = get_uuid()
task_name = "get_sampling_collection_records"
# Data sampling
sampling_collection_name = (
f"kv_trackme_dsm_data_sampling_tenant_{self.tenant_id}"
)
sampling_collection = self.service.kvstore[sampling_collection_name]
sampling_records, sampling_collection_keys, sampling_collection_dict = (
get_sampling_kv_collection(
sampling_collection, sampling_collection_name
)
)
# end task
#
task_end = time.time()
task_run_time = round((task_end - task_start), 3)
logging.info(
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
)
# dhm specific
if self.component == "dhm":
macro_name = (
f"trackme_dhm_default_splk_dhm_alert_policy_tenant_{self.tenant_id}"
)
macro_current = self.service.confs["macros"][macro_name]
default_splk_dhm_alerting_policy = macro_current.content.get("definition")
# remove double quotes from default_splk_dhm_alerting_policy
default_splk_dhm_alerting_policy = default_splk_dhm_alerting_policy.replace(
'"', ""
)
logging.debug(
f'instance_id={self.instance_id}, default_splk_dhm_alerting_policy="{default_splk_dhm_alerting_policy}"'
)
#
# component specific collections
#
if self.component in ["dsm", "dhm", "mhm", "flx", "fqm", "wlk"]:
# set task
#
task_start = time.time()
task_instance_id = get_uuid()
task_name = "get_datagen_collection_records"
# datagen
datagen_collection_name = (
f"kv_trackme_{self.component}_allowlist_tenant_{self.tenant_id}"
)
datagen_collection = self.service.kvstore[datagen_collection_name]
(
datagen_records,
datagen_collection_keys,
datagen_collection_dict,
datagen_collection_blocklist_not_regex_dict,
datagen_collection_blocklist_regex_dict,
) = get_feeds_datagen_kv_collection(
datagen_collection, datagen_collection_name, self.component
)
logging.debug(
f'instance_id={self.instance_id}, datagen_collection_dict="{json.dumps(datagen_collection_dict, indent=2)}"'
)
logging.debug(
f'instance_id={self.instance_id}, datagen_collection_blocklist_not_regex_dict="{json.dumps(datagen_collection_blocklist_not_regex_dict, indent=2)}"'
)
logging.debug(
f'instance_id={self.instance_id}, datagen_collection_blocklist_regex_dict="{json.dumps(datagen_collection_blocklist_regex_dict, indent=2)}"'
)
# end task
#
task_end = time.time()
task_run_time = round((task_end - task_start), 3)
logging.info(
f'instance_id={self.instance_id}, instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
)
#
# splk-flx specific collections
#
if self.component == "flx":
# set task
#
task_start = time.time()
task_instance_id = get_uuid()
task_name = "get_thresholds_collection_records"
# Thresholds
thresholds_collection_name = (
f"kv_trackme_flx_thresholds_tenant_{self.tenant_id}"
)
thresholds_collection = self.service.kvstore[thresholds_collection_name]
(
thresholds_records,
thresholds_collection_keys,
thresholds_collection_dict,
last_page,
) = search_kv_collection_sdkmode(
logging, self.service, thresholds_collection_name, page=1, page_count=0, orderby="keyid"
)
logging.debug(
f'instance_id={self.instance_id}, thresholds_collection_dict="{json.dumps(thresholds_collection_dict, indent=2)}"'
)
# end task
#
task_end = time.time()
task_run_time = round((task_end - task_start), 3)
logging.info(
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
)
# set task
#
task_start = time.time()
task_instance_id = get_uuid()
task_name = "get_drilldown_searches_collection_records"
# Drilldown searches
drilldown_searches_collection_name = (
f"kv_trackme_flx_drilldown_searches_tenant_{self.tenant_id}"
)
drilldown_searches_collection = self.service.kvstore[drilldown_searches_collection_name]
(
drilldown_searches_records,
drilldown_searches_collection_keys,
drilldown_searches_collection_dict,
last_page,
) = search_kv_collection_sdkmode(
logging, self.service, drilldown_searches_collection_name, page=1, page_count=0, orderby="keyid"
)
logging.debug(
f'instance_id={self.instance_id}, drilldown_searches_collection_dict="{json.dumps(drilldown_searches_collection_dict, indent=2)}"'
)
# end task
#
task_end = time.time()
task_run_time = round((task_end - task_start), 3)
logging.info(
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
)
# set task
#
task_start = time.time()
task_instance_id = get_uuid()
task_name = "get_default_metrics_collection_records"
# Default metrics
default_metrics_collection_name = (
f"kv_trackme_flx_default_metric_tenant_{self.tenant_id}"
)
default_metrics_collection = self.service.kvstore[default_metrics_collection_name]
(
default_metrics_records,
default_metrics_collection_keys,
default_metrics_collection_dict,
last_page,
) = search_kv_collection_sdkmode(
logging, self.service, default_metrics_collection_name, page=1, page_count=0, orderby="keyid"
)
logging.debug(
f'instance_id={self.instance_id}, default_metrics_collection_dict="{json.dumps(default_metrics_collection_dict, indent=2)}"'
)
# end task
#
task_end = time.time()
task_run_time = round((task_end - task_start), 3)
logging.info(
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
)
#
# splk-fqm specific collections
#
if self.component == "fqm":
# set task
#
task_start = time.time()
task_instance_id = get_uuid()
task_name = "get_thresholds_collection_records"
# Thresholds
thresholds_collection_name = (
f"kv_trackme_fqm_thresholds_tenant_{self.tenant_id}"
)
thresholds_collection = self.service.kvstore[thresholds_collection_name]
(
thresholds_records,
thresholds_collection_keys,
thresholds_collection_dict,
last_page,
) = search_kv_collection_sdkmode(
logging, self.service, thresholds_collection_name, page=1, page_count=0, orderby="keyid"
)
logging.debug(
f'instance_id={self.instance_id}, thresholds_collection_dict="{json.dumps(thresholds_collection_dict, indent=2)}"'
)
# end task
#
task_end = time.time()
task_run_time = round((task_end - task_start), 3)
logging.info(
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
)
#
# Virtual tenant account settings
#
# outliers tenant level settings
# outliers tenant level settings (deprecated - kept for backward compatibility)
# These are no longer used with score-based approach, but kept for backward compatibility
tenant_outliers_set_state = int(vtenant_conf.get("outliers_set_state", 1))
tenant_data_sampling_set_state = int(vtenant_conf.get("data_sampling_set_state", 1))
#
# Logical groups collection records
#
# set task
#
task_start = time.time()
task_instance_id = get_uuid()
task_name = "get_logical_groups_collection_records"
logical_group_coll = self.service.kvstore[
f"kv_trackme_common_logical_group_tenant_{self.tenant_id}"
]
(
logical_coll_records,
logical_coll_dict,
logical_coll_members_list,
logical_coll_members_dict,
logical_coll_count,
) = get_logical_groups_collection_records(logical_group_coll)
# log debug
logging.debug(
f'instance_id={self.instance_id}, function get_logical_groups_collection_records, logical_coll_dict="{json.dumps(logical_coll_dict, indent=2)}", logical_coll_count="{logical_coll_count}"'
)
# end task
#
task_end = time.time()
task_run_time = round((task_end - task_start), 3)
logging.info(
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
)
# Process records
processed_records = []
records_count = 0
# set task
#
task_start = time.time()
task_instance_id = get_uuid()
task_name = "process_records"
for record in records:
records_count += 1
try:
new_record = {}
# append_record boolean, True by default unless specific use cases
append_record = True
# get object_value and key
object_value = record.get("object", None)
logging.debug(
f'instance_id={self.instance_id}, object="{object_value}", record="{json.dumps(record, indent=2)}"'
)
# save the current value of object_state in the record as kvcurrent_object_state, we manipulate real state calculations
# and we need the original state in some conditions (sla)
record["kvcurrent_object_state"] = record.get("object_state", "N/A")
# The value for key is normally in the field keyid, but in some cases it is in the field key or _key
# use keyid, key, _key in that order of preference
if "keyid" in record:
key_value = record.get("keyid", None)
elif "object_id" in record:
key_value = record.get("object_id", None)
elif "key" in record:
key_value = record.get("key", None)
elif "_key" in record:
key_value = record.get("_key", None)
else:
key_value = None
# get the score for the object and add to the record
try:
score = int(scores_dict.get(key_value, {}).get("score", 0))
except:
score = 0
try:
score_outliers = int(scores_dict.get(key_value, {}).get("score_outliers", 0))
except:
score_outliers = 0
score_source = scores_dict.get(key_value, {}).get("score_source", [])
record["score"] = score
record["score_outliers"] = score_outliers
record["score_source"] = score_source
#
# Dynamic priority
#
dynamic_priority_lookup(
key_value,
priority_collection_keys,
priority_collection_dict,
record,
)
#
# Dynamic tags
#
dynamic_tags_lookup(
key_value,
tags_collection_keys,
tags_collection_dict,
record,
)
#
# Dynamic sla_class
#
dynamic_sla_class_lookup(
key_value,
sla_collection_keys,
sla_collection_dict,
record,
)
#
# Disruption queue
#
# Aggregate disruption_min_time_sec: take maximum value across all trackers
aggregated_disruption_min_time_sec = default_disruption_min_time_sec
if "disruption_min_time_sec" in record:
try:
disruption_min_time_value = record.get("disruption_min_time_sec")
if disruption_min_time_value:
disruption_times_by_tracker = None
# Parse if it's a JSON string
if isinstance(disruption_min_time_value, str):
try:
disruption_times_by_tracker = json.loads(disruption_min_time_value)
except (json.JSONDecodeError, TypeError):
# If parsing fails, might be old format numeric value
try:
aggregated_disruption_min_time_sec = max(
default_disruption_min_time_sec,
int(float(disruption_min_time_value))
)
except (ValueError, TypeError):
pass
elif isinstance(disruption_min_time_value, dict):
disruption_times_by_tracker = disruption_min_time_value
else:
# Numeric value (old format)
try:
aggregated_disruption_min_time_sec = max(
default_disruption_min_time_sec,
int(float(disruption_min_time_value))
)
except (ValueError, TypeError):
pass
# If tracker-keyed format, take maximum across all trackers
if disruption_times_by_tracker and isinstance(disruption_times_by_tracker, dict):
max_disruption_time = max(
int(float(v)) for v in disruption_times_by_tracker.values()
)
aggregated_disruption_min_time_sec = max(
default_disruption_min_time_sec,
max_disruption_time
)
except Exception as e:
logging.error(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
f'failed to aggregate disruption_min_time_sec, exception="{str(e)}"'
)
disruption_queue_record = disruption_queue_lookup(
key_value,
disruption_queue_collection_keys,
disruption_queue_collection_dict,
aggregated_disruption_min_time_sec,
)
if disruption_queue_record:
logging.debug(
f'instance_id={self.instance_id}, disruption_queue_record="type={type(disruption_queue_record)}, {json.dumps(disruption_queue_record, indent=2)}"'
)
#
# splk-dsm
#
# get record fields depending on the component
if self.component == "dsm":
# first check blocklist
if (
datagen_collection_blocklist_not_regex_dict
or datagen_collection_blocklist_regex_dict
):
append_record = apply_blocklist(
record,
datagen_collection_blocklist_not_regex_dict,
datagen_collection_blocklist_regex_dict,
)
if append_record:
# get outliers and data sampling
try:
isOutlier = int(record.get("isOutlier", 0))
except:
isOutlier = 0
try:
OutliersDisabled = int(record.get("OutliersDisabled", 0))
except:
OutliersDisabled = 0
try:
isAnomaly = int(record.get("isAnomaly", 0))
except:
isAnomaly = 0
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", isAnomaly="{isAnomaly}"'
)
# get future_tolerance
future_tolerance = record.get("future_tolerance", 0)
try:
future_tolerance = float(future_tolerance)
except:
future_tolerance = 0
#
# DSM Sampling
#
# call function dsm_sampling_lookup
dsm_sampling_lookup(
object_value,
sampling_collection_keys,
sampling_collection_dict,
record,
)
# check the value of allow_adaptive_delay (accepted values: true, false - as string)
allow_adaptive_delay = record.get("allow_adaptive_delay", "true")
if allow_adaptive_delay not in ["true", "false"]:
# log a warning
logging.warning(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", allow_adaptive_delay="{allow_adaptive_delay}" is not a valid value (accepted values: true, false), setting to "true"'
)
allow_adaptive_delay = "true"
# update the record
record["allow_adaptive_delay"] = "true"
# get actual primary KPI values
data_last_ingestion_lag_seen = record.get(
"data_last_ingestion_lag_seen", 0
)
if data_last_ingestion_lag_seen == "":
data_last_ingestion_lag_seen = 0
try:
data_last_ingestion_lag_seen = float(
data_last_ingestion_lag_seen
)
except:
data_last_ingestion_lag_seen = 0
data_last_lag_seen = record.get("data_last_lag_seen", 0)
# get per entity thresholds
data_max_lag_allowed = float(
record.get("data_max_lag_allowed", 0)
)
data_max_delay_allowed = float(
record.get("data_max_delay_allowed", 0)
)
min_dcount_threshold = record.get("min_dcount_threshold", 0)
try:
min_dcount_threshold = float(min_dcount_threshold)
except:
min_dcount_threshold = 0
# get dcount host related information
min_dcount_host = record.get("min_dcount_host", "any")
try:
min_dcount_host = float(min_dcount_host)
except:
pass
min_dcount_field = record.get("min_dcount_field", None)
# Get logical group information
# get logical group information: object_group_key
object_group_key = record.get("object_group_key", "")
# from logical_coll_dict, get object_logical_group_dict by object_group_key, this is sent to the status function
object_logical_group_dict = logical_coll_dict.get(
object_group_key, {}
)
# get data_last_ingest, data_last_time_seen, data_last_time_seen_idx (epochtime)
data_last_ingest = record.get("data_last_ingest", 0)
try:
data_last_ingest = float(data_last_ingest)
except:
pass
data_last_time_seen = record.get("data_last_time_seen", 0)
if data_last_time_seen == "":
data_last_time_seen = 0
try:
data_last_time_seen = float(data_last_time_seen)
except:
data_last_time_seen = 0
data_last_time_seen_idx = record.get(
"data_last_time_seen_idx", 0
)
try:
data_last_time_seen_idx = float(data_last_time_seen_idx)
except:
pass
# get monitoring time policy and rules (new fields)
monitoring_time_policy = record.get("monitoring_time_policy", None)
# if unset yet, use the tenant level and add to the record
if monitoring_time_policy is None or len(monitoring_time_policy) == 0:
monitoring_time_policy = default_monitoring_time_policy
record["monitoring_time_policy"] = default_monitoring_time_policy
monitoring_time_rules = record.get("monitoring_time_rules", None)
# call get_monitoring_time_status and define isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg
(
isUnderMonitoring,
monitoring_anomaly_reason,
isUnderMonitoringMsg,
) = get_monitoring_time_status(
monitoring_time_policy,
monitoring_time_rules,
)
# Get score data for this object_id (key_value) from scores_dict
score_data = scores_dict.get(key_value, {})
score = score_data.get("score", 0)
score_outliers = score_data.get("score_outliers", 0)
# call get_outliers_status and define isOutlier (with hybrid scoring)
isOutlier = get_outliers_status(
isOutlier, OutliersDisabled, tenant_outliers_set_state, score_outliers=score_outliers
)
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", OutliersDisabled="{OutliersDisabled}", tenant_outliers_set_state="{tenant_outliers_set_state}", score_outliers="{score_outliers}"'
)
# call get_data_sampling_status and define isAnomaly
isAnomaly = get_data_sampling_status(
record.get("data_sample_status_colour"),
record.get("data_sample_feature"),
tenant_data_sampling_set_state,
)
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isAnomaly="{isAnomaly}", tenant_data_sampling_set_state="{tenant_data_sampling_set_state}"'
)
# call get_future_status and define isFuture
(
isFuture,
isFutureMsg,
merged_future_tolerance,
) = get_future_status(
future_tolerance,
system_future_tolerance,
data_last_lag_seen,
data_last_ingestion_lag_seen,
data_last_time_seen,
data_last_ingest,
)
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isFuture="{isFuture}", future_tolerance="{future_tolerance}", system_future_tolerance="{system_future_tolerance}", merged_future_tolerance="{merged_future_tolerance}", data_last_lag_seen="{data_last_lag_seen}", isFutureMsg="{isFutureMsg}"'
)
# call get_is_under_dcount_host and define isUnderDcountHost
(
isUnderDcountHost,
isUnderDcountHostMsg,
) = get_is_under_dcount_host(
min_dcount_host, min_dcount_threshold, min_dcount_field
)
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isUnderDcountHost="{isUnderDcountHost}", isUnderDcountHostMsg="{isUnderDcountHostMsg}", min_dcount_host="{min_dcount_host}", min_dcount_threshold="{min_dcount_threshold}"'
)
# call get_dsm_latency_status and define isUnderLatencyAlert and isUnderLatencyMessage
(
isUnderLatencyAlert,
isUnderLatencyMessage,
) = get_dsm_latency_status(
data_last_ingestion_lag_seen,
data_max_lag_allowed,
data_last_ingest,
data_last_time_seen,
)
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isUnderLatencyAlert="{isUnderLatencyAlert}", isUnderLatencyMessage="{isUnderLatencyMessage}", data_last_ingestion_lag_seen="{data_last_ingestion_lag_seen}", data_max_lag_allowed="{data_max_lag_allowed}", data_last_ingest="{data_last_ingest}", data_last_time_seen="{data_last_time_seen}"'
)
# call get_dsm_delay_status and define isUnderDelayAlert and isUnderDelayMessage
(
isUnderDelayAlert,
isUnderDelayMessage,
) = get_dsm_delay_status(
data_last_lag_seen,
data_max_delay_allowed,
data_last_ingest,
data_last_time_seen,
)
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isUnderDelayAlert="{isUnderDelayAlert}", isUnderDelayMessage="{isUnderDelayMessage}", data_last_lag_seen="{data_last_lag_seen}", data_max_delay_allowed="{data_max_delay_allowed}", data_last_ingest="{data_last_ingest}", data_last_time_seen="{data_last_time_seen}"'
)
# call set_dsm_status and define object_state and anomaly_reason (with hybrid scoring)
(
object_state,
status_message,
status_message_json,
anomaly_reason,
) = set_dsm_status(
logging,
self._metadata.searchinfo.splunkd_uri,
self._metadata.searchinfo.session_key,
self.tenant_id,
record,
isOutlier,
isAnomaly,
isFuture,
isFutureMsg,
isUnderMonitoring,
isUnderMonitoringMsg,
isUnderDcountHost,
isUnderDcountHostMsg,
object_logical_group_dict,
isUnderLatencyAlert,
isUnderLatencyMessage,
isUnderDelayAlert,
isUnderDelayMessage,
disruption_queue_collection,
disruption_queue_record,
source_handler="trackmedecisionmaker",
monitoring_anomaly_reason=monitoring_anomaly_reason,
score=score,
score_outliers=score_outliers,
vtenant_account=vtenant_conf,
)
logging.debug(
f'instance_id={self.instance_id}, set_dsm_status, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
)
# insert our main fields
new_record["object_state"] = object_state
new_record["status_message"] = " | ".join(status_message)
new_record["status_message_json"] = status_message_json
new_record["anomaly_reason"] = "|".join(anomaly_reason)
# future tolerance
try:
new_record["future_tolerance"] = int(
round(merged_future_tolerance, 0)
)
except:
new_record["future_tolerance"] = -600
# convert data_last_time_seen to last_time from epoch
last_time = convert_epoch_to_datetime(data_last_time_seen)
new_record["last_time"] = last_time
# convert data_last_ingest to last_ingest from epoch
last_ingest = convert_epoch_to_datetime(data_last_ingest)
new_record["last_ingest"] = last_ingest
# convert data_last_time_seen_idx to last_time_idx from epoch
last_time_idx = convert_epoch_to_datetime(data_last_time_seen)
new_record["last_time_idx"] = last_time_idx
# get and convert latest_flip_time from epoch
latest_flip_time_human = record.get("latest_flip_time", 0)
try:
latest_flip_time_human = float(latest_flip_time_human)
except:
latest_flip_time_human = 0
new_record["latest_flip_time_human"] = (
convert_epoch_to_datetime(latest_flip_time_human)
)
# sla_timer
get_sla_timer(record, sla_classes, sla_default_class)
#
# splk-dhm
#
elif self.component == "dhm":
# first check blocklist
if (
datagen_collection_blocklist_not_regex_dict
or datagen_collection_blocklist_regex_dict
):
append_record = apply_blocklist(
record,
datagen_collection_blocklist_not_regex_dict,
datagen_collection_blocklist_regex_dict,
)
if append_record:
# get splk_dhm_st_summary
splk_dhm_st_summary = record.get("splk_dhm_st_summary", None)
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", splk_dhm_st_summary="{splk_dhm_st_summary}"'
)
# get outliers and data sampling
try:
isOutlier = int(record.get("isOutlier", 0))
except:
isOutlier = 0
try:
OutliersDisabled = int(record.get("OutliersDisabled", 0))
except:
OutliersDisabled = 0
try:
isAnomaly = int(record.get("isAnomaly", 0))
except:
isAnomaly = 0
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", isAnomaly="{isAnomaly}"'
)
# get future_tolerance
future_tolerance = record.get("future_tolerance", 0)
try:
future_tolerance = float(future_tolerance)
except:
future_tolerance = 0
# check the value of allow_adaptive_delay (accepted values: true, false - as string)
allow_adaptive_delay = record.get("allow_adaptive_delay", "true")
if allow_adaptive_delay not in ["true", "false"]:
# log a warning
logging.warning(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", allow_adaptive_delay="{allow_adaptive_delay}" is not a valid value (accepted values: true, false), setting to "true"'
)
allow_adaptive_delay = "true"
# update the record
record["allow_adaptive_delay"] = "true"
# get actual primary KPI values
data_last_ingestion_lag_seen = record.get(
"data_last_ingestion_lag_seen", 0
)
if data_last_ingestion_lag_seen == "":
data_last_ingestion_lag_seen = 0
try:
data_last_ingestion_lag_seen = float(
data_last_ingestion_lag_seen
)
except:
data_last_ingestion_lag_seen = 0
data_last_lag_seen = record.get("data_last_lag_seen", 0)
# get per entity thresholds
data_max_lag_allowed = float(
record.get("data_max_lag_allowed", 0)
)
data_max_delay_allowed = float(
record.get("data_max_delay_allowed", 0)
)
# Get logical group information
# get logical group information: object_group_key
object_group_key = record.get("object_group_key", "")
# from logical_coll_dict, get object_logical_group_dict by object_group_key, this is sent to the status function
object_logical_group_dict = logical_coll_dict.get(
object_group_key, {}
)
# get data_last_ingest, data_last_time_seen, data_last_time_seen_idx (epochtime)
data_last_ingest = record.get("data_last_ingest", 0)
try:
data_last_ingest = float(data_last_ingest)
except:
pass
data_last_time_seen = record.get("data_last_time_seen", 0)
if data_last_time_seen == "":
data_last_time_seen = 0
try:
data_last_time_seen = float(data_last_time_seen)
except:
data_last_time_seen = 0
data_last_time_seen_idx = record.get(
"data_last_time_seen_idx", 0
)
try:
data_last_time_seen_idx = float(data_last_time_seen_idx)
except:
pass
# get monitoring time policy and rules (new fields)
monitoring_time_policy = record.get("monitoring_time_policy", None)
# if unset yet, use the tenant level and add to the record
if monitoring_time_policy is None or len(monitoring_time_policy) == 0:
monitoring_time_policy = default_monitoring_time_policy
record["monitoring_time_policy"] = default_monitoring_time_policy
monitoring_time_rules = record.get("monitoring_time_rules", None)
# call get_monitoring_time_status and define isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg
(
isUnderMonitoring,
monitoring_anomaly_reason,
isUnderMonitoringMsg,
) = get_monitoring_time_status(
monitoring_time_policy,
monitoring_time_rules,
)
# Get score data for this object_id (key_value) from scores_dict
score_data = scores_dict.get(key_value, {})
score = score_data.get("score", 0)
score_outliers = score_data.get("score_outliers", 0)
# call get_outliers_status and define isOutlier (with hybrid scoring)
isOutlier = get_outliers_status(
isOutlier, OutliersDisabled, tenant_outliers_set_state, score_outliers=score_outliers
)
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", OutliersDisabled="{OutliersDisabled}", tenant_outliers_set_state="{tenant_outliers_set_state}", score_outliers="{score_outliers}"'
)
# call get_future_status and define isFuture
(
isFuture,
isFutureMsg,
merged_future_tolerance,
) = get_future_status(
future_tolerance,
system_future_tolerance,
data_last_lag_seen,
data_last_ingestion_lag_seen,
data_last_time_seen,
data_last_ingest,
)
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isFuture="{isFuture}", future_tolerance="{future_tolerance}", system_future_tolerance="{system_future_tolerance}", merged_future_tolerance="{merged_future_tolerance}", data_last_lag_seen="{data_last_lag_seen}", isFutureMsg="{isFutureMsg}"'
)
# call get_dsm_latency_status and define isUnderLatencyAlert and isUnderLatencyMessage
(
isUnderLatencyAlert,
isUnderLatencyMessage,
) = get_dsm_latency_status(
data_last_ingestion_lag_seen,
data_max_lag_allowed,
data_last_ingest,
data_last_time_seen,
)
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isUnderLatencyAlert="{isUnderLatencyAlert}", isUnderLatencyMessage="{isUnderLatencyMessage}", data_last_ingestion_lag_seen="{data_last_ingestion_lag_seen}", data_max_lag_allowed="{data_max_lag_allowed}", data_last_ingest="{data_last_ingest}", data_last_time_seen="{data_last_time_seen}"'
)
# call get_dsm_delay_status and define isUnderDelayAlert and isUnderDelayMessage
(
isUnderDelayAlert,
isUnderDelayMessage,
) = get_dsm_delay_status(
data_last_lag_seen,
data_max_delay_allowed,
data_last_ingest,
data_last_time_seen,
)
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isUnderDelayAlert="{isUnderDelayAlert}", isUnderDelayMessage="{isUnderDelayMessage}", data_last_lag_seen="{data_last_lag_seen}", data_max_delay_allowed="{data_max_delay_allowed}", data_last_ingest="{data_last_ingest}", data_last_time_seen="{data_last_time_seen}"'
)
# call set_dhm_status and define object_state and anomaly_reason (with hybrid scoring)
(
object_state,
status_message,
status_message_json,
anomaly_reason,
splk_dhm_alerting_policy,
) = set_dhm_status(
logging,
self._metadata.searchinfo.splunkd_uri,
self._metadata.searchinfo.session_key,
self.tenant_id,
record,
isOutlier,
isFuture,
isFutureMsg,
isUnderMonitoring,
isUnderMonitoringMsg,
object_logical_group_dict,
isUnderLatencyAlert,
isUnderLatencyMessage,
isUnderDelayAlert,
isUnderDelayMessage,
default_splk_dhm_alerting_policy,
disruption_queue_collection,
disruption_queue_record,
source_handler="trackmedecisionmaker",
monitoring_anomaly_reason=monitoring_anomaly_reason,
score=score,
score_outliers=score_outliers,
vtenant_account=vtenant_conf,
)
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
)
# insert our main fields
new_record["object_state"] = object_state
new_record["status_message"] = " | ".join(status_message)
new_record["status_message_json"] = status_message_json
new_record["anomaly_reason"] = "|".join(anomaly_reason)
# future tolerance
try:
new_record["future_tolerance"] = int(
round(merged_future_tolerance, 0)
)
except:
new_record["future_tolerance"] = -600
# specific for dhm
new_record["splk_dhm_alerting_policy"] = (
splk_dhm_alerting_policy
)
# convert data_last_time_seen to last_time from epoch
last_time = convert_epoch_to_datetime(data_last_time_seen)
new_record["last_time"] = last_time
# convert data_last_ingest to last_ingest from epoch
last_ingest = convert_epoch_to_datetime(data_last_ingest)
new_record["last_ingest"] = last_ingest
# convert data_last_time_seen_idx to last_time_idx from epoch
last_time_idx = convert_epoch_to_datetime(data_last_time_seen)
new_record["last_time_idx"] = last_time_idx
# get and convert latest_flip_time from epoch
latest_flip_time_human = record.get("latest_flip_time", 0)
try:
latest_flip_time_human = float(latest_flip_time_human)
except:
latest_flip_time_human = 0
new_record["latest_flip_time_human"] = (
convert_epoch_to_datetime(latest_flip_time_human)
)
# sla_timer
get_sla_timer(record, sla_classes, sla_default_class)
#
# splk-mhm
#
elif self.component == "mhm":
# first check blocklist
if (
datagen_collection_blocklist_not_regex_dict
or datagen_collection_blocklist_regex_dict
):
append_record = apply_blocklist(
record,
datagen_collection_blocklist_not_regex_dict,
datagen_collection_blocklist_regex_dict,
)
if append_record:
# get metric_details
metric_details = record.get("metric_details", None)
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", metric_details="{metric_details}"'
)
# Get logical group information
# get logical group information: object_group_key
object_group_key = record.get("object_group_key", "")
# from logical_coll_dict, get object_logical_group_dict by object_group_key, this is sent to the status function
object_logical_group_dict = logical_coll_dict.get(
object_group_key, {}
)
# get metric_last_time_seen (epochtime)
metric_last_time_seen = record.get("metric_last_time_seen", 0)
try:
metric_last_time_seen = float(metric_last_time_seen)
except:
pass
# Get score data for this object_id (key_value) from scores_dict
score_data = scores_dict.get(key_value, {})
score = score_data.get("score", 0)
score_outliers = score_data.get("score_outliers", 0)
# call get_future_metrics_status and define isFuture
isFuture, isFutureMsg = get_future_metrics_status(
system_future_tolerance,
metric_last_time_seen,
)
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isFuture="{isFuture}", system_future_tolerance="{system_future_tolerance}", metric_last_time_seen="{metric_last_time_seen}", isFutureMsg="{isFutureMsg}"'
)
# call set_mhm_status and define object_state and anomaly_reason (with hybrid scoring)
(
object_state,
status_message,
status_message_json,
anomaly_reason,
) = set_mhm_status(
logging,
self._metadata.searchinfo.splunkd_uri,
self._metadata.searchinfo.session_key,
self.tenant_id,
record,
metric_details,
isFuture,
isFutureMsg,
object_logical_group_dict,
disruption_queue_collection,
disruption_queue_record,
source_handler="trackmedecisionmaker",
score=score,
score_outliers=score_outliers,
vtenant_account=vtenant_conf,
)
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
)
# insert our main fields
new_record["object_state"] = object_state
new_record["status_message"] = " | ".join(status_message)
new_record["status_message_json"] = status_message_json
new_record["anomaly_reason"] = "|".join(anomaly_reason)
# convert metric_last_time_seen to last_time from epoch
last_time = convert_epoch_to_datetime(metric_last_time_seen)
new_record["last_time"] = last_time
# get and convert latest_flip_time from epoch
latest_flip_time_human = record.get("latest_flip_time", 0)
try:
latest_flip_time_human = float(latest_flip_time_human)
except:
latest_flip_time_human = 0
new_record["latest_flip_time_human"] = (
convert_epoch_to_datetime(latest_flip_time_human)
)
# sla_timer
get_sla_timer(record, sla_classes, sla_default_class)
#
# splk-flx
#
# get record fields depending on the component
elif self.component == "flx":
# first check blocklist
if (
datagen_collection_blocklist_not_regex_dict
or datagen_collection_blocklist_regex_dict
):
append_record = apply_blocklist(
record,
datagen_collection_blocklist_not_regex_dict,
datagen_collection_blocklist_regex_dict,
)
if append_record:
# get outliers
try:
isOutlier = int(record.get("isOutlier", 0))
except:
isOutlier = 0
try:
OutliersDisabled = int(record.get("OutliersDisabled", 0))
except:
OutliersDisabled = 0
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}"'
)
# get monitoring time policy and rules (new fields)
monitoring_time_policy = record.get("monitoring_time_policy", None)
# if unset yet, use the tenant level and add to the record
if monitoring_time_policy is None or len(monitoring_time_policy) == 0:
monitoring_time_policy = default_monitoring_time_policy
record["monitoring_time_policy"] = default_monitoring_time_policy
monitoring_time_rules = record.get("monitoring_time_rules", None)
# Get logical group information
# get logical group information: object_group_key
object_group_key = record.get("object_group_key", "")
# from logical_coll_dict, get object_logical_group_dict by object_group_key, this is sent to the status function
object_logical_group_dict = logical_coll_dict.get(
object_group_key, {}
)
# call get_monitoring_time_status and define isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg
(
isUnderMonitoring,
monitoring_anomaly_reason,
isUnderMonitoringMsg,
) = get_monitoring_time_status(
monitoring_time_policy,
monitoring_time_rules,
)
# Aggregate tracker-keyed JSON fields for concurrent trackers support
# Aggregate metrics: merge all trackers' metrics into a single dict
# This MUST be done BEFORE flx_check_dynamic_thresholds which expects aggregated metrics
if "metrics" in record:
try:
metrics_value = record.get("metrics")
if metrics_value:
metrics_by_tracker = None
# Parse if it's a JSON string
if isinstance(metrics_value, str):
try:
metrics_by_tracker = json.loads(metrics_value)
except (json.JSONDecodeError, TypeError):
# If parsing fails, might be old format, skip aggregation
pass
elif isinstance(metrics_value, dict):
metrics_by_tracker = metrics_value
if metrics_by_tracker and isinstance(metrics_by_tracker, dict):
# Check if it's tracker-keyed format (values are dicts) or old format (direct metrics dict)
aggregated_metrics = {}
is_tracker_keyed = False
for key, value in metrics_by_tracker.items():
if isinstance(value, dict):
# Check if value looks like metrics (has numeric/string values) or tracker data
# If all values in the nested dict are simple types, it's likely metrics
if all(isinstance(v, (int, float, str, bool)) or v is None for v in value.values()):
# This is tracker-keyed format, merge all trackers' metrics
aggregated_metrics.update(value)
is_tracker_keyed = True
else:
# Nested structure, might be tracker data
is_tracker_keyed = True
aggregated_metrics.update(value)
else:
# Simple value, old format
break
if is_tracker_keyed:
# Remove internal "status" field from aggregated metrics (not a user metric)
if "status" in aggregated_metrics:
del aggregated_metrics["status"]
# Update record with aggregated metrics as dict (for backward compatibility)
# Keep as dict since flx_check_dynamic_thresholds expects a dict
# Handle empty aggregated_metrics case (e.g., {"tracker1": {}})
record["metrics"] = aggregated_metrics
elif not is_tracker_keyed:
# Old format, keep as-is but ensure it's a dict and remove status field
if isinstance(metrics_value, str):
try:
old_metrics = json.loads(metrics_value)
if isinstance(old_metrics, dict) and "status" in old_metrics:
del old_metrics["status"]
record["metrics"] = old_metrics
except:
record["metrics"] = {}
else:
if isinstance(metrics_by_tracker, dict) and "status" in metrics_by_tracker:
metrics_by_tracker = metrics_by_tracker.copy()
del metrics_by_tracker["status"]
record["metrics"] = metrics_by_tracker
except Exception as e:
logging.error(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
f'failed to aggregate metrics, exception="{str(e)}"'
)
# flx thresholds lookup
flx_thresholds_lookup(
object_value,
key_value,
record,
thresholds_collection_dict,
)
logging.debug(
f'instance_id={self.instance_id}, dynamic_thresholds="{json.dumps(record.get("dynamic_thresholds", {}), indent=2)}"'
)
# flx check dynamic thresholds
threshold_alert, threshold_messages, threshold_scores = (
flx_check_dynamic_thresholds(
logging,
record.get("dynamic_thresholds", {}),
record.get("metrics", {}),
)
)
logging.debug(
f'instance_id={self.instance_id}, object_value="{object_value}", key_value="{key_value}", threshold_alert="{threshold_alert}", threshold_messages="{threshold_messages}", dynamic_thresholds="{json.dumps(record.get("dynamic_thresholds", {}), indent=2)}", metrics_record="{json.dumps(record.get("metrics", {}), indent=2)}"'
)
# flx drilldown searches lookup
try:
flx_drilldown_searches_lookup(
self.tenant_id,
record.get("tracker_name", ""),
record.get("account", "local"),
record,
drilldown_searches_collection_dict,
)
logging.debug(
f'instance_id={self.instance_id}, drilldown_search="{record.get("drilldown_search", "")}", drilldown_search_earliest="{record.get("drilldown_search_earliest", "")}", drilldown_search_latest="{record.get("drilldown_search_latest", "")}", drilldown_searches="{json.dumps(record.get("drilldown_searches", []), indent=2)}"'
)
except Exception as e:
logging.error(f"instance_id={self.instance_id}, Error in flx_drilldown_searches_lookup: {str(e)}")
# flx default metrics lookup
try:
flx_default_metrics_lookup(
self.tenant_id,
record.get("tracker_name", ""),
record,
default_metrics_collection_dict,
)
logging.debug(
f'instance_id={self.instance_id}, default_metric="{record.get("default_metric", "")}"'
)
except Exception as e:
logging.error(f"instance_id={self.instance_id}, Error in flx_default_metrics_lookup: {str(e)}")
# Get score data for this object_id (key_value) from scores_dict
score_data = scores_dict.get(key_value, {})
score = score_data.get("score", 0)
score_outliers = score_data.get("score_outliers", 0)
# call get_outliers_status and define isOutlier (with hybrid scoring)
isOutlier = get_outliers_status(
isOutlier, OutliersDisabled, tenant_outliers_set_state, score_outliers=score_outliers
)
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", OutliersDisabled="{OutliersDisabled}", tenant_outliers_set_state="{tenant_outliers_set_state}", score_outliers="{score_outliers}"'
)
# Preserve tracker-keyed JSON for status, status_description and status_description_short
# We'll aggregate them temporarily for set_flx_status, then restore for proper merging in trackmepersistentfields
# IMPORTANT: The macro preserves status as status_preserved, but we need to check if it's tracker-keyed format
# If status_preserved exists and is tracker-keyed JSON, use it; otherwise check status field
status_tracker_keyed = None
status_desc_tracker_keyed = None
status_desc_short_tracker_keyed = None
# Check if macro preserved tracker-keyed format (status_preserved field)
# The macro preserves status before mvindex operation
if "status_preserved" in record:
status_preserved = record.get("status_preserved")
if isinstance(status_preserved, str):
try:
parsed = json.loads(status_preserved)
if isinstance(parsed, dict):
# It's tracker-keyed format from macro preservation
status_tracker_keyed = status_preserved
except (json.JSONDecodeError, TypeError):
pass
elif isinstance(status_preserved, dict):
status_tracker_keyed = json.dumps(status_preserved)
# If not found in preserved field, check status field directly
if not status_tracker_keyed and "status" in record:
status_raw = record.get("status")
# Check if it's already tracker-keyed format (JSON string or dict)
if isinstance(status_raw, str):
# Try to parse as JSON to verify it's tracker-keyed format
try:
parsed_status = json.loads(status_raw)
if isinstance(parsed_status, dict):
# It's tracker-keyed format, preserve it
status_tracker_keyed = status_raw
except (json.JSONDecodeError, TypeError):
# Not valid JSON, might be old format
pass
elif isinstance(status_raw, dict):
# Already a dict (tracker-keyed format)
status_tracker_keyed = json.dumps(status_raw)
# Check if macro preserved tracker-keyed format (status_description_preserved field)
if "status_description_preserved" in record:
status_desc_preserved = record.get("status_description_preserved")
if isinstance(status_desc_preserved, str):
try:
parsed = json.loads(status_desc_preserved)
if isinstance(parsed, dict):
# It's tracker-keyed format from macro preservation
status_desc_tracker_keyed = status_desc_preserved
except (json.JSONDecodeError, TypeError):
# Check if it contains " | " separator (already aggregated)
if " | " not in status_desc_preserved:
status_desc_tracker_keyed = status_desc_preserved
elif isinstance(status_desc_preserved, dict):
status_desc_tracker_keyed = json.dumps(status_desc_preserved)
# If not found in preserved field, check status_description field directly
if not status_desc_tracker_keyed and "status_description" in record:
status_desc_raw = record.get("status_description")
# Check if it's tracker-keyed format
if isinstance(status_desc_raw, str):
# Try to parse as JSON to verify it's tracker-keyed format
try:
parsed_desc = json.loads(status_desc_raw)
if isinstance(parsed_desc, dict):
# It's tracker-keyed format, preserve it
status_desc_tracker_keyed = status_desc_raw
except (json.JSONDecodeError, TypeError):
# Check if it contains " | " separator (already aggregated)
if " | " not in status_desc_raw:
# Might be old format single string
status_desc_tracker_keyed = status_desc_raw
elif isinstance(status_desc_raw, dict):
# Already a dict (tracker-keyed format)
status_desc_tracker_keyed = json.dumps(status_desc_raw)
# Check if macro preserved tracker-keyed format (status_description_short_preserved field)
if "status_description_short_preserved" in record:
status_desc_short_preserved = record.get("status_description_short_preserved")
if isinstance(status_desc_short_preserved, str):
try:
parsed = json.loads(status_desc_short_preserved)
if isinstance(parsed, dict):
# It's tracker-keyed format from macro preservation
status_desc_short_tracker_keyed = status_desc_short_preserved
except (json.JSONDecodeError, TypeError):
if " | " not in status_desc_short_preserved:
status_desc_short_tracker_keyed = status_desc_short_preserved
elif isinstance(status_desc_short_preserved, dict):
status_desc_short_tracker_keyed = json.dumps(status_desc_short_preserved)
# If not found in preserved field, check status_description_short field directly
if not status_desc_short_tracker_keyed and "status_description_short" in record:
status_desc_short_raw = record.get("status_description_short")
# Similar logic as status_description
if isinstance(status_desc_short_raw, str):
try:
parsed_desc_short = json.loads(status_desc_short_raw)
if isinstance(parsed_desc_short, dict):
status_desc_short_tracker_keyed = status_desc_short_raw
except (json.JSONDecodeError, TypeError):
if " | " not in status_desc_short_raw:
status_desc_short_tracker_keyed = status_desc_short_raw
elif isinstance(status_desc_short_raw, dict):
status_desc_short_tracker_keyed = json.dumps(status_desc_short_raw)
# Aggregate status temporarily for set_flx_status: worst-status logic (2 > 3 > 1)
if "status" in record:
try:
status_str = record.get("status")
if status_str:
aggregated_status = None
if isinstance(status_str, str):
try:
status_by_tracker = json.loads(status_str)
if isinstance(status_by_tracker, dict):
# Tracker-keyed format: apply worst-status logic
status_values = list(status_by_tracker.values())
if status_values:
# Worst-status logic: 2 (red) > 3 (orange) > 1 (green)
if 2 in status_values:
aggregated_status = 2 # Red
elif 3 in status_values:
aggregated_status = 3 # Orange
else:
aggregated_status = 1 # Green (all are 1)
except (json.JSONDecodeError, TypeError):
# If parsing fails, might be old format integer string
try:
aggregated_status = int(status_str)
except (ValueError, TypeError):
pass
elif isinstance(status_str, dict):
# Already a dict, apply worst-status logic
status_values = list(status_str.values())
if status_values:
if 2 in status_values:
aggregated_status = 2 # Red
elif 3 in status_values:
aggregated_status = 3 # Orange
else:
aggregated_status = 1 # Green
elif isinstance(status_str, int):
# Old format integer, use as-is
aggregated_status = status_str
# Temporarily update record with aggregated status for set_flx_status
if aggregated_status is not None:
record["status"] = aggregated_status
except Exception as e:
logging.error(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
f'failed to aggregate status, exception="{str(e)}"'
)
# Determine number of trackers to decide if we need prefix
num_trackers = 1
if "tracker_name" in record:
try:
tracker_name_value = record.get("tracker_name")
if tracker_name_value:
if isinstance(tracker_name_value, str):
try:
tracker_names = json.loads(tracker_name_value)
if isinstance(tracker_names, list):
num_trackers = len(tracker_names)
except (json.JSONDecodeError, TypeError):
# If parsing fails, might be comma-separated string
if "," in tracker_name_value:
num_trackers = len([t.strip() for t in tracker_name_value.split(",")])
elif isinstance(tracker_name_value, list):
num_trackers = len(tracker_name_value)
except Exception:
pass
# Aggregate status_description temporarily for set_flx_status: concatenate all trackers' descriptions
if "status_description" in record:
try:
status_desc_str = record.get("status_description")
if status_desc_str:
if isinstance(status_desc_str, str):
try:
status_desc_by_tracker = json.loads(status_desc_str)
if isinstance(status_desc_by_tracker, dict):
# Check if it's tracker-keyed format
status_descriptions = []
for tracker_name, desc in status_desc_by_tracker.items():
if desc:
# Only add prefix if multiple trackers
if num_trackers > 1:
status_descriptions.append(f"{tracker_name}: {desc}")
else:
status_descriptions.append(desc)
if status_descriptions:
# Temporarily update record with aggregated status_description for set_flx_status
record["status_description"] = " | ".join(status_descriptions)
else:
# Empty, keep as-is
pass
except (json.JSONDecodeError, TypeError):
# If parsing fails, might be old format string, keep as-is
pass
elif isinstance(status_desc_str, dict):
# Already a dict, aggregate
status_descriptions = []
for tracker_name, desc in status_desc_str.items():
if desc:
# Only add prefix if multiple trackers
if num_trackers > 1:
status_descriptions.append(f"{tracker_name}: {desc}")
else:
status_descriptions.append(desc)
if status_descriptions:
# Temporarily update record with aggregated status_description for set_flx_status
record["status_description"] = " | ".join(status_descriptions)
except Exception as e:
logging.error(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
f'failed to aggregate status_description, exception="{str(e)}"'
)
# Aggregate status_description_short temporarily for set_flx_status: concatenate all trackers' descriptions
if "status_description_short" in record:
try:
status_desc_short_str = record.get("status_description_short")
if status_desc_short_str:
if isinstance(status_desc_short_str, str):
try:
status_desc_short_by_tracker = json.loads(status_desc_short_str)
if isinstance(status_desc_short_by_tracker, dict):
# Check if it's tracker-keyed format
status_descriptions_short = []
for tracker_name, desc in status_desc_short_by_tracker.items():
if desc:
# Only add prefix if multiple trackers
if num_trackers > 1:
status_descriptions_short.append(f"{tracker_name}: {desc}")
else:
status_descriptions_short.append(desc)
if status_descriptions_short:
# Temporarily update record with aggregated status_description_short for set_flx_status
record["status_description_short"] = " | ".join(status_descriptions_short)
except (json.JSONDecodeError, TypeError):
# If parsing fails, might be old format string, keep as-is
pass
elif isinstance(status_desc_short_str, dict):
# Already a dict, aggregate
status_descriptions_short = []
for tracker_name, desc in status_desc_short_str.items():
if desc:
# Only add prefix if multiple trackers
if num_trackers > 1:
status_descriptions_short.append(f"{tracker_name}: {desc}")
else:
status_descriptions_short.append(desc)
if status_descriptions_short:
# Temporarily update record with aggregated status_description_short for set_flx_status
record["status_description_short"] = " | ".join(status_descriptions_short)
except Exception as e:
logging.error(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
f'failed to aggregate status_description_short, exception="{str(e)}"'
)
# Generate per-tracker status messages before calling set_flx_status
# This allows us to store individual messages per tracker in status_message_json
per_tracker_status_messages = []
# Only generate per-tracker messages if we have valid tracker-keyed data
# Both status and status_description must be tracker-keyed format (JSON strings that parse to dicts)
if status_tracker_keyed and status_desc_tracker_keyed:
try:
# Parse tracker-keyed status and status_description
status_by_tracker = None
status_desc_by_tracker = None
if isinstance(status_tracker_keyed, str):
try:
status_by_tracker = json.loads(status_tracker_keyed)
except (json.JSONDecodeError, TypeError):
pass
elif isinstance(status_tracker_keyed, dict):
status_by_tracker = status_tracker_keyed
if isinstance(status_desc_tracker_keyed, str):
try:
status_desc_by_tracker = json.loads(status_desc_tracker_keyed)
except (json.JSONDecodeError, TypeError):
pass
elif isinstance(status_desc_tracker_keyed, dict):
status_desc_by_tracker = status_desc_tracker_keyed
# Generate status message for each tracker
if isinstance(status_by_tracker, dict) and isinstance(status_desc_by_tracker, dict):
# Verify we have tracker-keyed data (dict with multiple keys)
if len(status_by_tracker) > 0 and len(status_desc_by_tracker) > 0:
# Sort tracker names for consistent ordering
sorted_tracker_names = sorted(status_by_tracker.keys())
for tracker_name in sorted_tracker_names:
tracker_status = status_by_tracker.get(tracker_name)
tracker_status_desc = status_desc_by_tracker.get(tracker_name, "unknown")
if tracker_status is None:
continue
# Skip if status_description contains " | " (already aggregated)
if isinstance(tracker_status_desc, str) and " | " in tracker_status_desc:
logging.warning(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
f'tracker="{tracker_name}" has aggregated status_description, skipping per-tracker message generation'
)
continue
try:
tracker_status_int = int(tracker_status)
except (ValueError, TypeError):
tracker_status_int = 1
# Generate status message for this tracker (same format as set_flx_status)
# Use only this tracker's description, not the aggregated one
# Only add prefix if multiple trackers
if num_trackers > 1:
status_desc_with_prefix = f"{tracker_name}: {tracker_status_desc}"
else:
status_desc_with_prefix = tracker_status_desc
if tracker_status_int == 1:
tracker_msg = f"The entity status is complying with monitoring rules (status: {tracker_status_int}, status_description: {status_desc_with_prefix})"
else:
tracker_msg = f"The entity status is not complying with monitoring rules (status: {tracker_status_int}, status_description: {status_desc_with_prefix})"
per_tracker_status_messages.append(tracker_msg)
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
f'generated {len(per_tracker_status_messages)} per-tracker status messages from {len(sorted_tracker_names)} trackers'
)
else:
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
f'tracker-keyed data is empty, cannot generate per-tracker messages'
)
else:
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
f'tracker-keyed data is not in expected format: status_by_tracker={type(status_by_tracker)}, status_desc_by_tracker={type(status_desc_by_tracker)}'
)
except Exception as e:
logging.error(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
f'failed to generate per-tracker status messages, exception="{str(e)}"'
)
# call set_flx_status and define object_state and anomaly_reason (with hybrid scoring)
(
object_state,
status_message,
status_message_json,
anomaly_reason,
) = set_flx_status(
logging,
self._metadata.searchinfo.splunkd_uri,
self._metadata.searchinfo.session_key,
self.tenant_id,
record,
isOutlier,
isUnderMonitoring,
isUnderMonitoringMsg,
object_logical_group_dict,
threshold_alert,
threshold_messages,
disruption_queue_collection,
disruption_queue_record,
source_handler="trackmedecisionmaker",
score=score,
score_outliers=score_outliers,
threshold_scores=threshold_scores,
vtenant_account=vtenant_conf,
)
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
)
# Replace status_message_json["status_message"] with per-tracker messages if available
# Otherwise keep the aggregated message from set_flx_status
if per_tracker_status_messages:
# Use per-tracker messages for better visibility
# Each tracker gets its own message in the array
status_message_json["status_message"] = per_tracker_status_messages
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
f'replaced status_message_json with {len(per_tracker_status_messages)} per-tracker messages'
)
else:
# If no per-tracker messages were generated (e.g., old format), keep the aggregated message
# This ensures backward compatibility
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
f'using aggregated status_message from set_flx_status (no per-tracker messages generated)'
)
# Restore tracker-keyed JSON for status, status_description and status_description_short
# This ensures proper merging in trackmepersistentfields
if status_tracker_keyed is not None:
record["status"] = status_tracker_keyed
if status_desc_tracker_keyed is not None:
record["status_description"] = status_desc_tracker_keyed
if status_desc_short_tracker_keyed is not None:
record["status_description_short"] = status_desc_short_tracker_keyed
# insert our main fields
new_record["object_state"] = object_state
new_record["status_message"] = " | ".join(status_message)
new_record["status_message_json"] = status_message_json
new_record["anomaly_reason"] = "|".join(anomaly_reason)
# get and convert latest_flip_time from epoch
latest_flip_time_human = record.get("latest_flip_time", 0)
try:
latest_flip_time_human = float(latest_flip_time_human)
except:
latest_flip_time_human = 0
new_record["latest_flip_time_human"] = (
convert_epoch_to_datetime(latest_flip_time_human)
)
# sla_timer
get_sla_timer(record, sla_classes, sla_default_class)
# specific to flx, generate the status metric
try:
trackme_flx_gen_metrics(
record.get("_time", time.time()),
self.tenant_id,
object_value,
key_value,
metric_index,
json.dumps({"status": int(record.get("status", 1))}),
)
except Exception as e:
error_msg = f'instance_id={self.instance_id}, Failed to call trackme_flx_gen_metrics with exception="{str(e)}"'
logging.error(error_msg)
#
# splk-fqm
#
# get record fields depending on the component
elif self.component == "fqm":
# first check blocklist
if (
datagen_collection_blocklist_not_regex_dict
or datagen_collection_blocklist_regex_dict
):
append_record = apply_blocklist(
record,
datagen_collection_blocklist_not_regex_dict,
datagen_collection_blocklist_regex_dict,
)
if append_record:
# get outliers
try:
isOutlier = int(record.get("isOutlier", 0))
except:
isOutlier = 0
try:
OutliersDisabled = int(record.get("OutliersDisabled", 0))
except:
OutliersDisabled = 0
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}"'
)
# get monitoring time policy and rules (new fields)
monitoring_time_policy = record.get("monitoring_time_policy", None)
# if unset yet, use the tenant level and add to the record
if monitoring_time_policy is None or len(monitoring_time_policy) == 0:
monitoring_time_policy = default_monitoring_time_policy
record["monitoring_time_policy"] = default_monitoring_time_policy
monitoring_time_rules = record.get("monitoring_time_rules", None)
# Get logical group information
# get logical group information: object_group_key
object_group_key = record.get("object_group_key", "")
# from logical_coll_dict, get object_logical_group_dict by object_group_key, this is sent to the status function
object_logical_group_dict = logical_coll_dict.get(
object_group_key, {}
)
# call get_monitoring_time_status and define isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg
(
isUnderMonitoring,
monitoring_anomaly_reason,
isUnderMonitoringMsg,
) = get_monitoring_time_status(
monitoring_time_policy,
monitoring_time_rules,
)
# fqm thresholds lookup
fqm_thresholds_lookup(
object_value,
key_value,
record,
thresholds_collection_dict,
)
logging.debug(
f'instance_id={self.instance_id}, dynamic_thresholds="{json.dumps(record.get("dynamic_thresholds", {}), indent=2)}"'
)
# fqm check dynamic thresholds
threshold_alert, threshold_messages, threshold_scores = (
fqm_check_dynamic_thresholds(
logging,
record.get("dynamic_thresholds", {}),
record.get("metrics", {}),
)
)
logging.debug(
f'instance_id={self.instance_id}, object_value="{object_value}", key_value="{key_value}", threshold_alert="{threshold_alert}", threshold_messages="{threshold_messages}", dynamic_thresholds="{json.dumps(record.get("dynamic_thresholds", {}), indent=2)}", metrics_record="{json.dumps(record.get("metrics", {}), indent=2)}"'
)
# Get score data for this object_id (key_value) from scores_dict
score_data = scores_dict.get(key_value, {})
score = score_data.get("score", 0)
score_outliers = score_data.get("score_outliers", 0)
# call get_outliers_status and define isOutlier (with hybrid scoring)
isOutlier = get_outliers_status(
isOutlier, OutliersDisabled, tenant_outliers_set_state, score_outliers=score_outliers
)
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", OutliersDisabled="{OutliersDisabled}", tenant_outliers_set_state="{tenant_outliers_set_state}", score_outliers="{score_outliers}"'
)
# call set_fqm_status and define object_state and anomaly_reason (with hybrid scoring)
(
object_state,
status_message,
status_message_json,
anomaly_reason,
) = set_fqm_status(
logging,
self._metadata.searchinfo.splunkd_uri,
self._metadata.searchinfo.session_key,
self.tenant_id,
record,
isOutlier,
isUnderMonitoring,
isUnderMonitoringMsg,
object_logical_group_dict,
threshold_alert,
threshold_messages,
disruption_queue_collection,
disruption_queue_record,
source_handler="trackmedecisionmaker",
score=score,
score_outliers=score_outliers,
threshold_scores=threshold_scores,
vtenant_account=vtenant_conf,
)
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
)
# insert our main fields
new_record["object_state"] = object_state
new_record["status_message"] = " | ".join(status_message)
new_record["status_message_json"] = status_message_json
new_record["anomaly_reason"] = "|".join(anomaly_reason)
# get and convert latest_flip_time from epoch
latest_flip_time_human = record.get("latest_flip_time", 0)
try:
latest_flip_time_human = float(latest_flip_time_human)
except:
latest_flip_time_human = 0
new_record["latest_flip_time_human"] = (
convert_epoch_to_datetime(latest_flip_time_human)
)
# sla_timer
get_sla_timer(record, sla_classes, sla_default_class)
# specific to fqm, generate the status metric
try:
trackme_fqm_gen_metrics(
record.get("_time", time.time()),
self.tenant_id,
object_value,
key_value,
metric_index,
json.dumps({"status": int(record.get("status", 1))}),
)
except Exception as e:
error_msg = f'instance_id={self.instance_id}, Failed to call trackme_fqm_gen_metrics with exception="{str(e)}"'
logging.error(error_msg)
#
# splk-wlk
#
# get record fields depending on the component
elif self.component == "wlk":
# first check blocklist
if (
datagen_collection_blocklist_not_regex_dict
or datagen_collection_blocklist_regex_dict
):
append_record = apply_blocklist(
record,
datagen_collection_blocklist_not_regex_dict,
datagen_collection_blocklist_regex_dict,
)
if append_record:
# get outliers
try:
isOutlier = int(record.get("isOutlier", 0))
except:
isOutlier = 0
try:
OutliersDisabled = int(record.get("OutliersDisabled", 0))
except:
OutliersDisabled = 0
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}"'
)
# get monitoring time policy and rules (new fields)
monitoring_time_policy = record.get("monitoring_time_policy", None)
# if unset yet, use the tenant level and add to the record
if monitoring_time_policy is None or len(monitoring_time_policy) == 0:
monitoring_time_policy = default_monitoring_time_policy
record["monitoring_time_policy"] = default_monitoring_time_policy
monitoring_time_rules = record.get("monitoring_time_rules", None)
# call get_monitoring_time_status and define isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg
(
isUnderMonitoring,
monitoring_anomaly_reason,
isUnderMonitoringMsg,
) = get_monitoring_time_status(
monitoring_time_policy,
monitoring_time_rules,
)
# Get score data for this object_id (key_value) from scores_dict
score_data = scores_dict.get(key_value, {})
score = score_data.get("score", 0)
score_outliers = score_data.get("score_outliers", 0)
# call get_outliers_status and define isOutlier (with hybrid scoring)
isOutlier = get_outliers_status(
isOutlier, OutliersDisabled, tenant_outliers_set_state, score_outliers=score_outliers
)
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", OutliersDisabled="{OutliersDisabled}", tenant_outliers_set_state="{tenant_outliers_set_state}", score_outliers="{score_outliers}"'
)
# call set_wlk_status and define object_state and anomaly_reason (with hybrid scoring)
(
object_state,
status_message,
status_message_json,
anomaly_reason,
) = set_wlk_status(
logging,
self._metadata.searchinfo.splunkd_uri,
self._metadata.searchinfo.session_key,
self.tenant_id,
record,
isOutlier,
isUnderMonitoring,
isUnderMonitoringMsg,
disruption_queue_collection,
disruption_queue_record,
source_handler="trackmedecisionmaker",
monitoring_anomaly_reason=monitoring_anomaly_reason,
score=score,
score_outliers=score_outliers,
vtenant_account=vtenant_conf,
)
logging.debug(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
)
# insert our main fields
new_record["object_state"] = object_state
new_record["status_message"] = " | ".join(status_message)
new_record["status_message_json"] = status_message_json
new_record["anomaly_reason"] = "|".join(anomaly_reason)
# get and convert latest_flip_time from epoch
latest_flip_time_human = record.get("latest_flip_time", 0)
try:
latest_flip_time_human = float(latest_flip_time_human)
except:
latest_flip_time_human = 0
new_record["latest_flip_time_human"] = (
convert_epoch_to_datetime(latest_flip_time_human)
)
# sla_timer
get_sla_timer(record, sla_classes, sla_default_class)
#
# End per component processing
#
except Exception as e:
logging.error(
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", component="{self.component}", Error processing record, record="{json.dumps(record, indent=2)}", exception="{str(e)}"'
)
continue # Proceed with next record
#
# End per component processing
#
if append_record:
# add all key value pairs from the original record to new_record if not present already
for key, value in record.items():
if key not in new_record:
new_record[key] = value
# add new_record to processed_records
processed_records.append(new_record)
# end task
#
task_end = time.time()
task_run_time = round((task_end - task_start), 3)
logging.info(
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
)
#
# Render
#
# set task
#
task_start = time.time()
task_instance_id = get_uuid()
task_name = "render_records"
for yield_record in self.generate_fields(processed_records):
# logging
logging.debug(f'instance_id={self.instance_id}, yield_record="{json.dumps(yield_record, indent=2)}"')
# yield record
yield yield_record
# end task
#
task_end = time.time()
task_run_time = round((task_end - task_start), 3)
logging.info(
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
)
# performance counter
logging.info(
f'trackmedecisionmaker has terminated, tenant_id="{self.tenant_id}", component="{self.component}", instance_id="{self.instance_id}", upstream_records="{records_count}", processed_records="{len(processed_records)}", run_time="{round(time.time() - start, 3)}"'
)
# Splunk SDK entry point: splunklib's dispatch() runs the TrackMeDecisionMaker
# streaming command only when this module is executed as a script (it compares
# the supplied __name__ against "__main__" internally), consuming search
# results from stdin and emitting processed records to stdout.
dispatch(TrackMeDecisionMaker, sys.argv, sys.stdin, sys.stdout, __name__)