#!/usr/bin/env python
# coding=utf-8
__name__ = "trackme_rest_handler_component.py"
__author__ = "TrackMe Limited"
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
__credits__ = "TrackMe Limited, U.K."
__license__ = "TrackMe Limited, all rights reserved"
__version__ = "0.1.0"
__maintainer__ = "TrackMe Limited, U.K."
__email__ = "support@trackme-solutions.com"
__status__ = "PRODUCTION"
# Built-in libraries
import json
import os
import sys
import time
import requests
# splunk home
splunkhome = os.environ["SPLUNK_HOME"]
# append current directory
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# import libs
import import_declare_test
# set logging
from trackme_libs_logging import setup_logger
logger = setup_logger(
"trackme.rest.component_user", "trackme_rest_api_component_user.log"
)
# Redirect global logging to use the same handler
import logging
logging.getLogger().handlers = logger.handlers
logging.getLogger().setLevel(logger.level)
# import rest handler
import trackme_rest_handler
# import trackme libs
from trackme_libs import trackme_getloglevel, trackme_vtenant_account, trackme_reqinfo
# import trackme libs utils
from trackme_libs_utils import replace_encoded_backslashes, get_uuid
# import Splunk libs
import splunklib.client as client
# import TrackMe get data libs
from trackme_libs_get_data import (
search_kv_collection,
get_target_from_kv_collection,
get_sampling_kv_collection,
get_collection_documents_count,
get_wlk_apps_enablement_kv_collection,
get_feeds_datagen_kv_collection,
search_kv_collection_restmode,
search_kv_collection_searchmode,
search_kv_collection_sdkmode,
)
# import TrackMe decision maker libs
from trackme_libs_decisionmaker import (
pre_filter_records,
filter_records,
convert_epoch_to_datetime,
get_monitoring_time_status,
get_outliers_status,
get_data_sampling_status,
get_future_status,
get_future_metrics_status,
get_is_under_dcount_host,
get_logical_groups_collection_records,
get_dsm_latency_status,
get_dsm_delay_status,
set_dsm_status,
set_dhm_status,
set_mhm_status,
set_flx_status,
set_fqm_status,
set_wlk_status,
ack_check,
define_state_icon_code,
outliers_readiness,
logical_group_lookup,
set_feeds_lag_summary,
set_feeds_thresholds_duration,
dsm_sampling_lookup,
outliers_data_lookup,
sampling_anomaly_status,
get_coll_docs_ref,
docs_ref_lookup,
wlk_disabled_apps_lookup,
wlk_versioning_lookup,
wlk_orphan_lookup,
apply_blocklist,
dsm_check_default_thresholds,
dhm_check_default_thresholds,
dynamic_priority_lookup,
dynamic_tags_lookup,
dynamic_sla_class_lookup,
get_sla_timer,
flx_thresholds_lookup,
fqm_thresholds_lookup,
flx_check_dynamic_thresholds,
fqm_check_dynamic_thresholds,
flx_drilldown_searches_lookup,
flx_default_metrics_lookup,
calculate_score,
)
# import trackme libs disruption queue
from trackme_libs_disruption_queue import (
disruption_queue_lookup,
disruption_queue_update,
disruption_queue_get_duration,
)
# import chart generation functions from stateful alert helper
from modalert_trackme_stateful_alert_helper import (
get_chart_search,
get_mlmodels_from_kvstore,
flx_get_metrics_catalog_for_object_id,
fqm_get_metrics_catalog_for_object_id,
wlk_get_metrics_catalog_for_object_id,
remove_leading_spaces,
)
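# This REST handler exposes the read-only endpoints used to load TrackMe component
# data (dsm, dhm, mhm, flx, fqm, wlk): entity records are read from the tenant
# KVstore collections and enriched through the decision maker workflow before being returned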
class TrackMeHandlerComponentRead_v2(trackme_rest_handler.RESTHandler):
def __init__(self, command_line, command_arg):
super(TrackMeHandlerComponentRead_v2, self).__init__(
command_line, command_arg, logger
)
def get_resource_group_desc_component(self, request_info, **kwargs):
response = {
"resource_group_name": "component",
"resource_group_desc": "Endpoints specific to TrackMe's components data offload (read only operations)",
}
return {"payload": response, "status": 200}
# Get the component data with pagination and progressive load capabilities
def get_load_component_data(self, request_info, **kwargs):
describe = False
try:
params_dict = request_info.raw_args["query_parameters"]
except Exception as e:
params_dict = None
try:
resp_dict = json.loads(str(request_info.raw_args["payload"]))
except Exception as e:
resp_dict = None
logger.info(
f'function get_load_component_data called, params_dict="{params_dict}"'
)
# Start performance counter
start = time.time()
if resp_dict is not None:
try:
describe = resp_dict["describe"]
if describe in ("true", "True"):
describe = True
except Exception as e:
describe = False
if not params_dict and not resp_dict:
describe = True
# if describe is requested, show the usage
if describe:
response = {
"describe": "This endpoint retrieves a TrackMe's component table data, it requires a GET call using params and the following options:",
"resource_desc": "Get TrackMe's component data",
"resource_spl_example": "| trackme url=\"/services/trackme/v2/component/load_component_data\" mode=\"get\" params=\"{'tenant_id': 'mytenant', 'component': 'flx', 'page': 1, 'size': 100}\"",
"options": [
{
"tenant_id": "(required) tenant identifier",
"component": "(required) component identifier, valid options are: flx, dsm, dhm, mhm, wlk, fqm",
"filter_object": "(optional) target a specific TrackMe object record, do not specify this for no filtering",
"filter_key": "(optional) target a specific TrackMe key record, do not specify this for no filtering",
"filter_objects": "(optional) comma-separated list of TrackMe object records to filter on, do not specify this for no filtering",
"filter_keys": "(optional) comma-separated list of TrackMe key records to filter on, do not specify this for no filtering",
"pagination_mode": "(optional) set to true to enable pagination, valid options are: local, remote. Defaults to remote.",
"page": "(optional) page number, specific the page to be retrieved, defaults to page 1",
"size": "(optional) number of records to retrieve, set to 0 with page: 1 to retrieve all records in a single operation",
"mode_view": "(optional) for splk-dhm/splk-mhm/splk-wlk, the view mode, defaults to minimal, valid options are: minimal, compact, full",
"load_charts_resources": "(optional) set to true to load the charts resources, defaults to false",
}
],
}
return {"payload": response, "status": 200}
if params_dict is not None:
# tenant_id
try:
tenant_id = params_dict["tenant_id"]
except Exception as e:
return {
"payload": {
"action": "failure",
"response": "the tenant_id is required",
},
"status": 400,
}
# component
try:
component = params_dict["component"]
except Exception as e:
return {
"payload": {
"action": "failure",
"response": "the component is required",
},
"status": 400,
}
# pagination_mode, optional and defaults to remote if not specified
try:
pagination_mode = params_dict["pagination_mode"]
if pagination_mode not in ("local", "remote"):
return {
"payload": {
"action": "failure",
"response": "the pagination_mode is invalid",
},
"status": 400,
}
except Exception as e:
pagination_mode = "remote"
# filter_object, optional and defaults to None if not specified
try:
filter_object = params_dict["filter_object"]
except Exception as e:
filter_object = None
# filter_key, optional and defaults to None if not specified
try:
filter_key = params_dict["filter_key"]
except Exception as e:
filter_key = None
# filter_objects, optional and defaults to None if not specified
try:
filter_objects = params_dict["filter_objects"]
if filter_objects:
filter_objects = [obj.strip() for obj in filter_objects.split(",")]
except Exception as e:
filter_objects = None
# filter_keys, optional and defaults to None if not specified
try:
filter_keys = params_dict["filter_keys"]
if filter_keys:
filter_keys = [key.strip() for key in filter_keys.split(",")]
except Exception as e:
filter_keys = None
# page, if not submitted, default to 1
try:
page = int(params_dict["page"])
except Exception as e:
page = 1
# size, if not submitted, default to 0
try:
size = int(params_dict["size"])
except Exception as e:
size = 0
# mode_view
try:
mode_view = params_dict["mode_view"]
except Exception as e:
mode_view = "minimal"
logger.debug(f'mode_view="{mode_view}"')
# load_charts_resources (accepts boolean, true or false as strings, 0 or 1 as integers or strings)
try:
load_charts_resources = params_dict["load_charts_resources"]
if isinstance(load_charts_resources, str):
if load_charts_resources in ("true", "True", "1"):
load_charts_resources = True
elif load_charts_resources in ("false", "False", "0"):
load_charts_resources = False
elif isinstance(load_charts_resources, int):
if load_charts_resources == 1:
load_charts_resources = True
elif load_charts_resources == 0:
load_charts_resources = False
except Exception as e:
load_charts_resources = False
# Get splunkd port
splunkd_port = request_info.server_rest_port
# Get service
service = client.connect(
owner="nobody",
app="trackme",
port=splunkd_port,
token=request_info.session_key,
timeout=600,
)
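# the connection targets the local splunkd instance on behalf of the requesting
# user (session_key), with a 600 seconds timeout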
# set loglevel
loglevel = trackme_getloglevel(
request_info.system_authtoken, request_info.server_rest_port
)
logger.setLevel(loglevel)
# set instance_id
self.instance_id = get_uuid()
# Get trackmeconf
trackme_conf = trackme_reqinfo(
request_info.system_authtoken, request_info.server_rest_uri
)["trackme_conf"]
# Get virtual tenant account
vtenant_conf = trackme_vtenant_account(
request_info.system_authtoken,
request_info.server_rest_uri,
tenant_id,
)
#
# System level settings
#
system_future_tolerance = float(
trackme_conf["splk_general"]["splk_general_feeds_future_tolerance"]
)
#
# System level default minimal disruption period
#
default_disruption_min_time_sec = int(
vtenant_conf["default_disruption_min_time_sec"]
)
#
# Tenant level default monitoring time policy
#
try:
default_monitoring_time_policy = vtenant_conf["monitoring_time_policy"]
except Exception as e:
default_monitoring_time_policy = "all_time"
#
# SLA timer
#
sla_classes = {}
sla_default_class = None
sla_classes = trackme_conf["sla"]["sla_classes"]
# try loading the JSON
try:
sla_classes = json.loads(sla_classes)
sla_default_class = trackme_conf["sla"]["sla_default_class"]
if not len(sla_default_class) > 0 or sla_default_class not in sla_classes:
sla_default_class = "silver"
logger.error(
f'instance_id={self.instance_id}, Invalid sla_default_class="{sla_default_class}", this SLA class is not part of the SLA classes, applying fallback configuration'
)
except Exception as e:
logger.error(
f'instance_id={self.instance_id}, Error loading sla_classes JSON, please check the configuration, the JSON is not valid JSON, applying fallback configuration, exception="{str(e)}"'
)
sla_classes = json.loads(
'{"gold": {"sla_threshold": 14400, "rank": 3}, "silver": {"sla_threshold": 86400, "rank": 2}, "platinum": {"sla_threshold": 172800, "rank": 1}}'
)
sla_default_class = "silver"
# retrieve the score for the tenant and component
scores_dict = calculate_score(service, tenant_id, component)
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", component="{component}", scores_dict="{json.dumps(scores_dict, indent=2)}"'
)
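# scores_dict is keyed by the entity KVstore _key and exposes score, score_outliers
# and score_source, which are attached to each record in the processing loop below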
# dsm specific
if component == "dsm":
# docs references
docs_is_global = "False"
# doc_note_global
docs_note_global = trackme_conf["splk_general"][
"splk_general_dsm_docs_note_global"
]
if not docs_note_global:
docs_note_global = "N/A"
# docs_link_global
docs_link_global = trackme_conf["splk_general"][
"splk_general_dsm_docs_link_global"
]
if not docs_link_global:
docs_link_global = "N/A"
# both should be defined to be enabled
if docs_note_global == "N/A" or docs_link_global == "N/A":
docs_note_global = "N/A"
docs_link_global = "N/A"
else:
docs_is_global = "True"
# dhm specific
if component == "dhm":
macro_name = f"trackme_dhm_default_splk_dhm_alert_policy_tenant_{tenant_id}"
macro_current = service.confs["macros"][macro_name]
default_splk_dhm_alerting_policy = macro_current.content.get("definition")
# remove double quotes from default_splk_dhm_alerting_policy
default_splk_dhm_alerting_policy = default_splk_dhm_alerting_policy.replace(
'"', ""
)
logger.debug(
f'instance_id={self.instance_id}, default_splk_dhm_alerting_policy="{default_splk_dhm_alerting_policy}"'
)
#
# splk-flx specific collections
#
if component == "flx":
# Thresholds
thresholds_collection_name = f"kv_trackme_flx_thresholds_tenant_{tenant_id}"
thresholds_collection = service.kvstore[thresholds_collection_name]
(
thresholds_records,
thresholds_collection_keys,
thresholds_collection_dict,
last_page,
) = search_kv_collection_sdkmode(
logger, service, thresholds_collection_name, page=1, page_count=0, orderby="keyid"
)
logger.debug(
f'instance_id={self.instance_id}, thresholds_collection_dict="{json.dumps(thresholds_collection_dict, indent=2)}"'
)
# Drilldown searches
drilldown_searches_collection_name = f"kv_trackme_flx_drilldown_searches_tenant_{tenant_id}"
try:
drilldown_searches_collection = service.kvstore[drilldown_searches_collection_name]
(
drilldown_searches_records,
drilldown_searches_collection_keys,
drilldown_searches_collection_dict,
last_page,
) = search_kv_collection_sdkmode(
logger, service, drilldown_searches_collection_name, page=1, page_count=0, orderby="keyid"
)
except Exception as e:
logger.debug(f"instance_id={self.instance_id}, Drilldown searches collection not found or accessible: {str(e)}")
drilldown_searches_records = []
drilldown_searches_collection_keys = []
drilldown_searches_collection_dict = {}
logger.debug(
f'instance_id={self.instance_id}, drilldown_searches_collection_dict="{json.dumps(drilldown_searches_collection_dict, indent=2)}"'
)
# Default metrics
default_metrics_collection_name = f"kv_trackme_flx_default_metric_tenant_{tenant_id}"
try:
default_metrics_collection = service.kvstore[default_metrics_collection_name]
(
default_metrics_records,
default_metrics_collection_keys,
default_metrics_collection_dict,
last_page,
) = search_kv_collection_sdkmode(
logger, service, default_metrics_collection_name, page=1, page_count=0, orderby="keyid"
)
except Exception as e:
logger.debug(f"instance_id={self.instance_id}, Default metrics collection not found or accessible: {str(e)}")
default_metrics_records = []
default_metrics_collection_keys = []
default_metrics_collection_dict = {}
logger.debug(
f'instance_id={self.instance_id}, default_metrics_collection_dict="{json.dumps(default_metrics_collection_dict, indent=2)}"'
)
#
# splk-fqm specific collections
#
if component == "fqm":
# Thresholds
thresholds_collection_name = f"kv_trackme_fqm_thresholds_tenant_{tenant_id}"
thresholds_collection = service.kvstore[thresholds_collection_name]
(
thresholds_records,
thresholds_collection_keys,
thresholds_collection_dict,
last_page,
) = search_kv_collection_sdkmode(
logger, service, thresholds_collection_name, page=1, page_count=0, orderby="keyid"
)
logger.debug(
f'instance_id={self.instance_id}, thresholds_collection_dict="{json.dumps(thresholds_collection_dict, indent=2)}"'
)
#
# Virtual tenant account settings
#
# outliers tenant level settings (deprecated): no longer used with the score-based approach, kept for backward compatibility
tenant_outliers_set_state = int(vtenant_conf.get("outliers_set_state", 1))
tenant_data_sampling_set_state = int(vtenant_conf.get("data_sampling_set_state", 1))
#
# Logical groups collection records
#
logical_group_coll = service.kvstore[
f"kv_trackme_common_logical_group_tenant_{tenant_id}"
]
(
logical_coll_records,
logical_coll_dict,
logical_coll_members_list,
logical_coll_members_dict,
logical_coll_count,
) = get_logical_groups_collection_records(logical_group_coll)
# log debug
logger.debug(
f'instance_id={self.instance_id}, function get_logical_groups_collection_records, logical_coll_dict="{json.dumps(logical_coll_dict, indent=2)}", logical_coll_count="{logical_coll_count}"'
)
# entities KV collection
data_collection_name = f"kv_trackme_{component}_tenant_{tenant_id}"
data_collection = service.kvstore[data_collection_name]
# detect if we have multiple filters, if we do, set size to 0 as we need to retrieve all records
multiple_filters = False
query_parameters = request_info.raw_args["query_parameters"]
if params_dict:
# Loop through all query parameters
for key, value in query_parameters.items():
if "filter[" in key:
if key == "filter[1][field]":
multiple_filters = True
if multiple_filters:
size = 0
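# with multiple filters, all records are retrieved in a single call so that
# the filters can be applied locally (see pre_filter_records below)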
if (
not filter_object
and not filter_key
and not filter_objects
and not filter_keys
):
# get records
if size == 0:
func_start = time.time()
data_records, data_collection_keys, data_collection_dict, last_page = (
search_kv_collection(
service,
data_collection_name,
page=1,
page_count=0,
)
)
last_page = 1
logger.info(
f"instance_id={self.instance_id}, function search_kv_collection took {round(time.time() - func_start, 2)} seconds, records_count={len(data_records)}"
)
else:
func_start = time.time()
data_records, data_collection_keys, data_collection_dict, last_page = (
search_kv_collection(
service,
data_collection_name,
page=page,
page_count=size,
)
)
logger.info(
f"instance_id={self.instance_id}, function search_kv_collection took {round(time.time() - func_start, 2)} seconds, records_count={len(data_records)}"
)
elif filter_object: # filter on a given object
data_records, data_collection_keys, data_collection_dict = (
get_target_from_kv_collection(
"object", filter_object, data_collection, data_collection_name
)
)
last_page = 1
total_record_count = len(data_records)
elif filter_key: # filter on a given key
data_records, data_collection_keys, data_collection_dict = (
get_target_from_kv_collection(
"_key", filter_key, data_collection, data_collection_name
)
)
last_page = 1
total_record_count = len(data_records)
elif filter_objects: # filter on multiple objects
data_records, data_collection_keys, data_collection_dict = (
get_target_from_kv_collection(
"object", filter_objects, data_collection, data_collection_name
)
)
last_page = 1
total_record_count = len(data_records)
elif filter_keys: # filter on multiple keys
data_records, data_collection_keys, data_collection_dict = (
get_target_from_kv_collection(
"_key", filter_keys, data_collection, data_collection_name
)
)
last_page = 1
total_record_count = len(data_records)
# for later usage
total_record_count = len(data_records)
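# the following collections are loaded in full (page_count=0) and used to enrich
# each entity record in the decision maker loop: acknowledgements, priorities,
# tags, SLA classes and the disruption queue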
# get Ack collection
ack_collection_name = f"kv_trackme_common_alerts_ack_tenant_{tenant_id}"
ack_collection = service.kvstore[ack_collection_name]
(
ack_records,
ack_collection_keys,
ack_collection_dict,
last_page,
) = search_kv_collection_sdkmode(
logger, service, ack_collection_name, page=1, page_count=0, orderby="object"
)
# get priority collection
priority_collection_name = f"kv_trackme_{component}_priority_tenant_{tenant_id}"
priority_collection = service.kvstore[priority_collection_name]
(
priority_records,
priority_collection_keys,
priority_collection_dict,
last_page,
) = search_kv_collection_sdkmode(
logger, service, priority_collection_name, page=1, page_count=0, orderby="keyid"
)
# get tags collection
tags_collection_name = f"kv_trackme_{component}_tags_tenant_{tenant_id}"
tags_collection = service.kvstore[tags_collection_name]
(
tags_records,
tags_collection_keys,
tags_collection_dict,
last_page,
) = search_kv_collection_sdkmode(
logger, service, tags_collection_name, page=1, page_count=0, orderby="keyid"
)
# get sla collection
sla_collection_name = f"kv_trackme_{component}_sla_tenant_{tenant_id}"
sla_collection = service.kvstore[sla_collection_name]
(
sla_records,
sla_collection_keys,
sla_collection_dict,
last_page,
) = search_kv_collection_sdkmode(
logger, service, sla_collection_name, page=1, page_count=0, orderby="keyid"
)
# get disruption queue collection
disruption_queue_collection_name = (
f"kv_trackme_common_disruption_queue_tenant_{tenant_id}"
)
disruption_queue_collection = service.kvstore[disruption_queue_collection_name]
(
disruption_queue_records,
disruption_queue_collection_keys,
disruption_queue_collection_dict,
last_page,
) = search_kv_collection_sdkmode(
logger, service, disruption_queue_collection_name, page=1, page_count=0, orderby="keyid"
)
logger.debug(
f'instance_id={self.instance_id}, disruption_queue_collection_dict="{json.dumps(disruption_queue_collection_dict, indent=2)}"'
)
# get outliers data (all components except mhm)
if component not in ["mhm"]:
# data collection
outliers_data_collection_name = (
f"kv_trackme_{component}_outliers_entity_data_tenant_{tenant_id}"
)
outliers_data_collection = service.kvstore[outliers_data_collection_name]
(
outliers_data_records,
outliers_data_collection_keys,
outliers_data_collection_dict,
last_page,
) = search_kv_collection_sdkmode(
logger, service, outliers_data_collection_name, page=1, page_count=0, orderby="keyid"
)
# rules collection
outliers_rules_collection_name = (
f"kv_trackme_{component}_outliers_entity_rules_tenant_{tenant_id}"
)
outliers_rules_collection = service.kvstore[outliers_rules_collection_name]
(
outliers_rules_records,
outliers_rules_collection_keys,
outliers_rules_collection_dict,
last_page,
) = search_kv_collection_sdkmode(
logger, service, outliers_rules_collection_name, page=1, page_count=0, orderby="keyid"
)
#
# component specific collections
#
if component in ["dsm", "dhm", "mhm", "flx", "fqm", "wlk"]:
# datagen
datagen_collection_name = (
f"kv_trackme_{component}_allowlist_tenant_{tenant_id}"
)
datagen_collection = service.kvstore[datagen_collection_name]
(
datagen_records,
datagen_collection_keys,
datagen_collection_dict,
datagen_collection_blocklist_not_regex_dict,
datagen_collection_blocklist_regex_dict,
) = get_feeds_datagen_kv_collection(
datagen_collection, datagen_collection_name, component
)
logger.debug(
f'instance_id={self.instance_id}, datagen_collection_dict="{json.dumps(datagen_collection_dict, indent=2)}"'
)
logger.debug(
f'instance_id={self.instance_id}, datagen_collection_blocklist_not_regex_dict="{json.dumps(datagen_collection_blocklist_not_regex_dict, indent=2)}"'
)
logger.debug(
f'instance_id={self.instance_id}, datagen_collection_blocklist_regex_dict="{json.dumps(datagen_collection_blocklist_regex_dict, indent=2)}"'
)
#
# splk-dsm specific collections
#
if component == "dsm":
# Data sampling
sampling_collection_name = (
f"kv_trackme_dsm_data_sampling_tenant_{tenant_id}"
)
sampling_collection = service.kvstore[sampling_collection_name]
sampling_records, sampling_collection_keys, sampling_collection_dict = (
get_sampling_kv_collection(
sampling_collection, sampling_collection_name
)
)
# Docs reference
docs_collection_name = f"kv_trackme_dsm_knowledge_tenant_{tenant_id}"
docs_collection = service.kvstore[docs_collection_name]
(
docs_collection_records,
docs_collection_records_dict,
docs_collection_members_list,
docs_collection_members_dict,
) = get_coll_docs_ref(docs_collection, docs_collection_name)
logger.debug(
f'instance_id={self.instance_id}, docs_collection_dict="{json.dumps(docs_collection_members_dict, indent=2)}"'
)
#
# splk-wlk specific collections
#
if component == "wlk":
# apps_disabled
apps_enablement_collection_name = (
f"kv_trackme_wlk_apps_enablement_tenant_{tenant_id}"
)
apps_enablement_collection = service.kvstore[
apps_enablement_collection_name
]
(
apps_enablement_records,
apps_enablement_collection_keys,
apps_enablement_collection_dict,
) = get_wlk_apps_enablement_kv_collection(
apps_enablement_collection, apps_enablement_collection_name
)
logger.debug(
f'instance_id={self.instance_id}, apps_enablement_collection_dict="{json.dumps(apps_enablement_collection_dict, indent=2)}"'
)
# versioning
versioning_collection_name = f"kv_trackme_wlk_versioning_tenant_{tenant_id}"
versioning_collection = service.kvstore[versioning_collection_name]
(
versioning_records,
versioning_collection_keys,
versioning_collection_dict,
last_page,
) = search_kv_collection_sdkmode(
logger, service, versioning_collection_name, page=1, page_count=0, orderby="keyid"
)
logger.debug(
f'instance_id={self.instance_id}, versioning_collection_dict="{json.dumps(versioning_collection_dict, indent=2)}"'
)
# orphan
orphan_collection_name = f"kv_trackme_wlk_orphan_status_tenant_{tenant_id}"
orphan_collection = service.kvstore[orphan_collection_name]
(
orphan_records,
orphan_collection_keys,
orphan_collection_dict,
last_page,
) = search_kv_collection_sdkmode(
logger, service, orphan_collection_name, page=1, page_count=0, orderby="keyid"
)
logger.debug(
f'instance_id={self.instance_id}, orphan_collection_dict="{json.dumps(orphan_collection_dict, indent=2)}"'
)
# A list to store processed records
processed_records = []
# Process records through TrackMe's decision maker workflow
records_count = 0
# filter records - server-side filters are not working for now, records are pre-filtered locally below
query_parameters_json = request_info.raw_args["query_parameters"]
logger.info(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", component="{component}", received query_parameters_json="{json.dumps(query_parameters_json, indent=2)}"'
)
# pre-filtered records
prefiltered_records = pre_filter_records(data_records, query_parameters_json)
# loop over the pre-filtered records and apply the decision maker enrichment
for record in prefiltered_records:
records_count += 1
try:
logger.debug(f"instance_id={self.instance_id}, processing record")
# append_record boolean, True by default unless specific use cases apply (e.g. blocklisted entities)
append_record = True
# get object_value and key
object_value = record.get("object", None)
logger.debug(
f'instance_id={self.instance_id}, object="{object_value}", record="{json.dumps(record, indent=2)}"'
)
# save the current value of object_state in the record as kvcurrent_object_state; the real state is recalculated below
# and the original state is needed in some conditions (sla)
record["kvcurrent_object_state"] = record.get("object_state", "N/A")
# get the KVstore unique key and add to the record as keyid
key_value = record.get("_key", None)
record["keyid"] = key_value
# get the score for the object and add to the record
try:
score = int(scores_dict.get(key_value, {}).get("score", 0))
except:
score = 0
try:
score_outliers = int(scores_dict.get(key_value, {}).get("score_outliers", 0))
except:
score_outliers = 0
record["score_outliers"] = score_outliers
try:
score_source = scores_dict.get(key_value, {}).get("score_source", [])
except:
score_source = []
record["score"] = score
record["score_source"] = score_source
# ensure alias has not encoded backslashes
record["alias"] = replace_encoded_backslashes(record.get("alias", ""))
#
# logical group lookup
#
if component not in ["wlk"]:
logical_group_lookup(
object_value,
logical_coll_members_list,
logical_coll_members_dict,
record,
)
#
# some safety checks for feeds (dsm/dhm)
#
if component in ["dsm"]:
dsm_check_default_thresholds(record, trackme_conf)
elif component in ["dhm"]:
dhm_check_default_thresholds(record, trackme_conf)
#
# Check Ack
#
# Call ack_check function
ack_check(
object_value,
ack_collection_keys,
ack_collection_dict,
record,
)
#
# Dynamic priority
#
dynamic_priority_lookup(
key_value,
priority_collection_keys,
priority_collection_dict,
record,
)
#
# Dynamic tags
#
dynamic_tags_lookup(
key_value,
tags_collection_keys,
tags_collection_dict,
record,
)
#
# Dynamic sla_class
#
dynamic_sla_class_lookup(
key_value,
sla_collection_keys,
sla_collection_dict,
record,
)
#
# Disruption queue
#
# Aggregate disruption_min_time_sec: take maximum value across all trackers
aggregated_disruption_min_time_sec = default_disruption_min_time_sec
if "disruption_min_time_sec" in record:
try:
disruption_min_time_value = record.get("disruption_min_time_sec")
if disruption_min_time_value:
disruption_times_by_tracker = None
# Parse if it's a JSON string
if isinstance(disruption_min_time_value, str):
try:
disruption_times_by_tracker = json.loads(disruption_min_time_value)
except (json.JSONDecodeError, TypeError):
# If parsing fails, might be old format numeric value
try:
aggregated_disruption_min_time_sec = max(
default_disruption_min_time_sec,
int(float(disruption_min_time_value))
)
except (ValueError, TypeError):
pass
elif isinstance(disruption_min_time_value, dict):
disruption_times_by_tracker = disruption_min_time_value
else:
# Numeric value (old format)
try:
aggregated_disruption_min_time_sec = max(
default_disruption_min_time_sec,
int(float(disruption_min_time_value))
)
except (ValueError, TypeError):
pass
# If tracker-keyed format, take maximum across all trackers
if disruption_times_by_tracker and isinstance(disruption_times_by_tracker, dict):
max_disruption_time = max(
int(float(v)) for v in disruption_times_by_tracker.values()
)
aggregated_disruption_min_time_sec = max(
default_disruption_min_time_sec,
max_disruption_time
)
except Exception as e:
logger.error(
f'instance_id={self.instance_id}, failed to aggregate disruption_min_time_sec for object="{object_value}", '
f'exception="{str(e)}"'
)
disruption_queue_record = disruption_queue_lookup(
key_value,
disruption_queue_collection_keys,
disruption_queue_collection_dict,
aggregated_disruption_min_time_sec,
)
#
# Outliers status (all components except mhm)
#
if component not in ["mhm"]:
outliers_data_lookup(
key_value,
outliers_data_collection_keys,
outliers_data_collection_dict,
outliers_rules_collection_keys,
outliers_rules_collection_dict,
record,
)
#
# Outliers readiness
#
outliers_readiness(record)
#
# Human time fields context
#
record["latest_flip_time (translated)"] = convert_epoch_to_datetime(
record.get("latest_flip_time", "0")
)
record["tracker_runtime (translated)"] = convert_epoch_to_datetime(
record.get("tracker_runtime", "0")
)
#
# tags field, if not existing in record, set to "N/A"
#
tags_auto = record.get("tags_auto", [])
tags_manual = record.get("tags_manual", [])
if tags_auto:
# if tags_auto is a string, convert to a list
if isinstance(tags_auto, str):
tags_auto = tags_auto.split(",")
else:
tags_auto = []
# add to record
record["tags_auto"] = tags_auto
if tags_manual:
# if tags_manual is a string, convert to a list
if isinstance(tags_manual, str):
tags_manual = tags_manual.split(",")
else:
tags_manual = []
# add to record
record["tags_manual"] = tags_manual
# merge tags_auto and tags_manual into tags
tags = sorted(
list(set([x.lower() for x in tags_auto + tags_manual if x]))
)
# finally, set the tags field, defaulting to N/A when empty
if not tags:
record["tags"] = "N/A"
else:
record["tags"] = tags
#
# splk-dsm
#
# get record fields depending on the component
if component == "dsm":
# first check blocklist
if (
datagen_collection_blocklist_not_regex_dict
or datagen_collection_blocklist_regex_dict
):
append_record = apply_blocklist(
record,
datagen_collection_blocklist_not_regex_dict,
datagen_collection_blocklist_regex_dict,
)
if append_record:
# refresh data_last_lag_seen in the record
try:
record["data_last_lag_seen"] = time.time() - float(
record.get("data_last_time_seen", 0)
)
except:
record["data_last_lag_seen"] = 0
# get outliers and data sampling
try:
isOutlier = int(record.get("isOutlier", 0))
except:
isOutlier = 0
try:
OutliersDisabled = int(record.get("OutliersDisabled", 0))
except:
OutliersDisabled = 0
try:
isAnomaly = int(record.get("isAnomaly", 0))
except:
isAnomaly = 0
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", isAnomaly="{isAnomaly}"'
)
# get future_tolerance
future_tolerance = record.get("future_tolerance", 0)
try:
future_tolerance = float(future_tolerance)
except:
future_tolerance = 0
# get actual primary KPI values
data_last_ingestion_lag_seen = record.get(
"data_last_ingestion_lag_seen", 0
)
if data_last_ingestion_lag_seen == "":
data_last_ingestion_lag_seen = 0
try:
data_last_ingestion_lag_seen = float(
data_last_ingestion_lag_seen
)
except:
data_last_ingestion_lag_seen = 0
data_last_lag_seen = record.get("data_last_lag_seen", 0)
# get per entity thresholds
data_max_lag_allowed = float(
record.get("data_max_lag_allowed", 0)
)
data_max_delay_allowed = float(
record.get("data_max_delay_allowed", 0)
)
min_dcount_threshold = record.get("min_dcount_threshold", 0)
try:
min_dcount_threshold = float(min_dcount_threshold)
except:
min_dcount_threshold = 0
# get dcount host related information
min_dcount_host = record.get("min_dcount_host", "any")
try:
min_dcount_host = float(min_dcount_host)
except:
pass
min_dcount_field = record.get("min_dcount_field", None)
# get monitoring time policy and rules (new fields)
monitoring_time_policy = record.get("monitoring_time_policy", None)
# if unset yet, use the tenant level and add to the record
if monitoring_time_policy is None or len(monitoring_time_policy) == 0:
monitoring_time_policy = default_monitoring_time_policy
record["monitoring_time_policy"] = default_monitoring_time_policy
monitoring_time_rules = record.get("monitoring_time_rules", None)
# Get logical group information
# get logical group information: object_group_key
object_group_key = record.get("object_group_key", "")
# from logical_coll_dict, get object_logical_group_dict by object_group_key, this is sent to the status function
object_logical_group_dict = logical_coll_dict.get(
object_group_key, {}
)
# get data_last_ingest, data_last_time_seen, data_last_time_seen_idx (epochtime)
data_last_ingest = record.get("data_last_ingest", 0)
try:
data_last_ingest = float(data_last_ingest)
except:
pass
data_last_time_seen = record.get("data_last_time_seen", 0)
if data_last_time_seen == "":
data_last_time_seen = 0
try:
data_last_time_seen = float(data_last_time_seen)
except:
data_last_time_seen = 0
data_last_time_seen_idx = record.get(
"data_last_time_seen_idx", 0
)
try:
data_last_time_seen_idx = float(data_last_time_seen_idx)
except:
pass
# call get_monitoring_time_status and define isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg
(
isUnderMonitoring,
monitoring_anomaly_reason,
isUnderMonitoringMsg,
) = get_monitoring_time_status(
monitoring_time_policy,
monitoring_time_rules,
)
# call get_outliers_status and define isOutlier (with hybrid scoring)
# Note: score and score_outliers are already extracted from scores_dict above (lines 921-933)
isOutlier = get_outliers_status(
isOutlier, OutliersDisabled, tenant_outliers_set_state, score_outliers=score_outliers
)
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", OutliersDisabled="{OutliersDisabled}", tenant_outliers_set_state="{tenant_outliers_set_state}", score_outliers="{score_outliers}"'
)
#
# DSM Sampling
#
# call function dsm_sampling_lookup
dsm_sampling_lookup(
object_value,
sampling_collection_keys,
sampling_collection_dict,
record,
)
# call get_data_sampling_status and define isAnomaly
isAnomaly = get_data_sampling_status(
record.get("data_sample_status_colour"),
record.get("data_sample_feature"),
tenant_data_sampling_set_state,
)
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isAnomaly="{isAnomaly}", tenant_data_sampling_set_state="{tenant_data_sampling_set_state}"'
)
# call get_future_status and define isFuture
(
isFuture,
isFutureMsg,
merged_future_tolerance,
) = get_future_status(
future_tolerance,
system_future_tolerance,
data_last_lag_seen,
data_last_ingestion_lag_seen,
data_last_time_seen,
data_last_ingest,
)
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isFuture="{isFuture}", future_tolerance="{future_tolerance}", system_future_tolerance="{system_future_tolerance}", merged_future_tolerance="{merged_future_tolerance}", data_last_lag_seen="{data_last_lag_seen}", isFutureMsg="{isFutureMsg}"'
)
# call get_is_under_dcount_host and define isUnderDcountHost
(
isUnderDcountHost,
isUnderDcountHostMsg,
) = get_is_under_dcount_host(
min_dcount_host, min_dcount_threshold, min_dcount_field
)
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isUnderDcountHost="{isUnderDcountHost}", isUnderDcountHostMsg="{isUnderDcountHostMsg}", min_dcount_host="{min_dcount_host}", min_dcount_threshold="{min_dcount_threshold}"'
)
# call get_dsm_latency_status and define isUnderLatencyAlert and isUnderLatencyMessage
(
isUnderLatencyAlert,
isUnderLatencyMessage,
) = get_dsm_latency_status(
data_last_ingestion_lag_seen,
data_max_lag_allowed,
data_last_ingest,
data_last_time_seen,
)
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isUnderLatencyAlert="{isUnderLatencyAlert}", isUnderLatencyMessage="{isUnderLatencyMessage}", data_last_ingestion_lag_seen="{data_last_ingestion_lag_seen}", data_max_lag_allowed="{data_max_lag_allowed}", data_last_ingest="{data_last_ingest}", data_last_time_seen="{data_last_time_seen}"'
)
# call get_dsm_delay_status and define isUnderDelayAlert and isUnderDelayMessage
(
isUnderDelayAlert,
isUnderDelayMessage,
) = get_dsm_delay_status(
data_last_lag_seen,
data_max_delay_allowed,
data_last_ingest,
data_last_time_seen,
)
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isUnderDelayAlert="{isUnderDelayAlert}", isUnderDelayMessage="{isUnderDelayMessage}", data_last_lag_seen="{data_last_lag_seen}", data_max_delay_allowed="{data_max_delay_allowed}", data_last_ingest="{data_last_ingest}", data_last_time_seen="{data_last_time_seen}"'
)
# Initialize threshold_scores for DSM (DSM doesn't use dynamic thresholds, so this is always empty)
threshold_scores = []
# call set_dsm_status and define object_state and anomaly_reason (with hybrid scoring)
(
object_state,
status_message,
status_message_json,
anomaly_reason,
) = set_dsm_status(
logger,
request_info.server_rest_uri,
request_info.system_authtoken,
tenant_id,
record,
isOutlier,
isAnomaly,
isFuture,
isFutureMsg,
isUnderMonitoring,
isUnderMonitoringMsg,
isUnderDcountHost,
isUnderDcountHostMsg,
object_logical_group_dict,
isUnderLatencyAlert,
isUnderLatencyMessage,
isUnderDelayAlert,
isUnderDelayMessage,
disruption_queue_collection,
disruption_queue_record,
source_handler="rest_handler",
monitoring_anomaly_reason=monitoring_anomaly_reason,
score=score,
score_outliers=score_outliers,
vtenant_account=vtenant_conf,
)
logger.debug(
f'instance_id={self.instance_id}, set_dsm_status, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
)
# insert our main fields
record["object_state"] = object_state
record["status_message"] = " | ".join(status_message)
record["status_message_json"] = status_message_json
record["anomaly_reason"] = "|".join(anomaly_reason)
# generate charts resources for this entity
if load_charts_resources:
try:
charts_resources = generate_charts_resources(
tenant_id=tenant_id,
component="dsm",
object=object_value,
keyid=key_value,
anomaly_reason=anomaly_reason,
vtenant_conf=vtenant_conf,
service=service
)
record["charts_resources"] = charts_resources
except Exception as e:
logger.debug(f"Failed to generate charts for DSM entity {key_value}: {str(e)}")
record["charts_resources"] = []
# sampling status
sampling_anomaly_status(record)
# future tolerance
try:
record["future_tolerance"] = int(
round(merged_future_tolerance, 0)
)
except:
record["future_tolerance"] = -600
# convert data_last_time_seen to last_time from epoch
last_time = convert_epoch_to_datetime(data_last_time_seen)
record["last_time"] = last_time
# convert data_last_ingest to last_ingest from epoch
last_ingest = convert_epoch_to_datetime(data_last_ingest)
record["last_ingest"] = last_ingest
# convert data_last_time_seen_idx to last_time_idx from epoch
last_time_idx = convert_epoch_to_datetime(data_last_time_seen_idx)
record["last_time_idx"] = last_time_idx
# get and convert latest_flip_time from epoch
latest_flip_time_human = record.get("latest_flip_time", 0)
try:
latest_flip_time_human = float(latest_flip_time_human)
except:
latest_flip_time_human = 0
record["latest_flip_time_human"] = convert_epoch_to_datetime(
latest_flip_time_human
)
# set lag_summary field
record["lag_summary"] = set_feeds_lag_summary(record, component)
# get and set thresholds_duration
(
data_max_delay_allowed_duration,
data_max_lag_allowed_duration,
) = set_feeds_thresholds_duration(record)
record["data_max_delay_allowed_duration"] = (
data_max_delay_allowed_duration
)
record["data_max_lag_allowed_duration"] = (
data_max_lag_allowed_duration
)
# Documentation note
docs_ref_lookup(
docs_is_global,
docs_note_global,
docs_link_global,
object_value,
docs_collection_members_list,
docs_collection_members_dict,
record,
)
# sla_timer
get_sla_timer(record, sla_classes, sla_default_class)
#
# splk-dhm
#
elif component == "dhm":
# first check blocklist
if (
datagen_collection_blocklist_not_regex_dict
or datagen_collection_blocklist_regex_dict
):
append_record = apply_blocklist(
record,
datagen_collection_blocklist_not_regex_dict,
datagen_collection_blocklist_regex_dict,
)
if append_record:
# refresh data_last_lag_seen in the record
try:
record["data_last_lag_seen"] = time.time() - float(
record.get("data_last_time_seen", 0)
)
except:
record["data_last_lag_seen"] = 0
# get splk_dhm_st_summary
splk_dhm_st_summary = record.get("splk_dhm_st_summary", None)
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", splk_dhm_st_summary="{splk_dhm_st_summary}"'
)
# get outliers and data sampling
try:
isOutlier = int(record.get("isOutlier", 0))
except:
isOutlier = 0
try:
OutliersDisabled = int(record.get("OutliersDisabled", 0))
except:
OutliersDisabled = 0
try:
isAnomaly = int(record.get("isAnomaly", 0))
except:
isAnomaly = 0
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", isAnomaly="{isAnomaly}"'
)
# get future_tolerance
future_tolerance = record.get("future_tolerance", 0)
try:
future_tolerance = float(future_tolerance)
except:
future_tolerance = 0
# get actual primary KPI values
data_last_ingestion_lag_seen = record.get(
"data_last_ingestion_lag_seen", 0
)
if data_last_ingestion_lag_seen == "":
data_last_ingestion_lag_seen = 0
try:
data_last_ingestion_lag_seen = float(
data_last_ingestion_lag_seen
)
except:
data_last_ingestion_lag_seen = 0
data_last_lag_seen = record.get("data_last_lag_seen", 0)
# get per entity thresholds
data_max_lag_allowed = float(
record.get("data_max_lag_allowed", 0)
)
data_max_delay_allowed = float(
record.get("data_max_delay_allowed", 0)
)
# get monitoring time policy and rules (new fields)
monitoring_time_policy = record.get("monitoring_time_policy", None)
# if unset yet, use the tenant level and add to the record
if monitoring_time_policy is None or len(monitoring_time_policy) == 0:
monitoring_time_policy = default_monitoring_time_policy
record["monitoring_time_policy"] = default_monitoring_time_policy
monitoring_time_rules = record.get("monitoring_time_rules", None)
# Get logical group information
# get logical group information: object_group_key
object_group_key = record.get("object_group_key", "")
# from logical_coll_dict, get object_logical_group_dict by object_group_key, this is sent to the status function
object_logical_group_dict = logical_coll_dict.get(
object_group_key, {}
)
# get data_last_ingest, data_last_time_seen, data_last_time_seen_idx (epochtime)
data_last_ingest = record.get("data_last_ingest", 0)
try:
data_last_ingest = float(data_last_ingest)
except:
pass
data_last_time_seen = record.get("data_last_time_seen", 0)
if data_last_time_seen == "":
data_last_time_seen = 0
try:
data_last_time_seen = float(data_last_time_seen)
except:
data_last_time_seen = 0
data_last_time_seen_idx = record.get(
"data_last_time_seen_idx", 0
)
try:
data_last_time_seen_idx = float(data_last_time_seen_idx)
except:
pass
# call get_monitoring_time_status and define isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg
(
isUnderMonitoring,
monitoring_anomaly_reason,
isUnderMonitoringMsg,
) = get_monitoring_time_status(
monitoring_time_policy,
monitoring_time_rules,
)
# call get_outliers_status and define isOutlier (with hybrid scoring)
# Note: score and score_outliers are already extracted from scores_dict above (lines 920-923)
isOutlier = get_outliers_status(
isOutlier, OutliersDisabled, tenant_outliers_set_state, score_outliers=score_outliers
)
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", OutliersDisabled="{OutliersDisabled}", tenant_outliers_set_state="{tenant_outliers_set_state}", score_outliers="{score_outliers}"'
)
# call get_future_status and define isFuture
(
isFuture,
isFutureMsg,
merged_future_tolerance,
) = get_future_status(
future_tolerance,
system_future_tolerance,
data_last_lag_seen,
data_last_ingestion_lag_seen,
data_last_time_seen,
data_last_ingest,
)
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isFuture="{isFuture}", future_tolerance="{future_tolerance}", system_future_tolerance="{system_future_tolerance}", merged_future_tolerance="{merged_future_tolerance}", data_last_lag_seen="{data_last_lag_seen}", isFutureMsg="{isFutureMsg}"'
)
# call get_dsm_latency_status and define isUnderLatencyAlert and isUnderLatencyMessage
(
isUnderLatencyAlert,
isUnderLatencyMessage,
) = get_dsm_latency_status(
data_last_ingestion_lag_seen,
data_max_lag_allowed,
data_last_ingest,
data_last_time_seen,
)
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isUnderLatencyAlert="{isUnderLatencyAlert}", isUnderLatencyMessage="{isUnderLatencyMessage}", data_last_ingestion_lag_seen="{data_last_ingestion_lag_seen}", data_max_lag_allowed="{data_max_lag_allowed}", data_last_ingest="{data_last_ingest}", data_last_time_seen="{data_last_time_seen}"'
)
# call get_dsm_delay_status and define isUnderDelayAlert and isUnderDelayMessage
(
isUnderDelayAlert,
isUnderDelayMessage,
) = get_dsm_delay_status(
data_last_lag_seen,
data_max_delay_allowed,
data_last_ingest,
data_last_time_seen,
)
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isUnderDelayAlert="{isUnderDelayAlert}", isUnderDelayMessage="{isUnderDelayMessage}", data_last_lag_seen="{data_last_lag_seen}", data_max_delay_allowed="{data_max_delay_allowed}", data_last_ingest="{data_last_ingest}", data_last_time_seen="{data_last_time_seen}"'
)
# Initialize threshold_scores for DHM (DHM doesn't use dynamic thresholds, so this is always empty)
threshold_scores = []
# call set_dhm_status and define object_state and anomaly_reason (with hybrid scoring)
# Note: score and score_outliers are already extracted from scores_dict above (lines 921-933)
(
object_state,
status_message,
status_message_json,
anomaly_reason,
splk_dhm_alerting_policy,
) = set_dhm_status(
logger,
request_info.server_rest_uri,
request_info.system_authtoken,
tenant_id,
record,
isOutlier,
isFuture,
isFutureMsg,
isUnderMonitoring,
isUnderMonitoringMsg,
object_logical_group_dict,
isUnderLatencyAlert,
isUnderLatencyMessage,
isUnderDelayAlert,
isUnderDelayMessage,
default_splk_dhm_alerting_policy,
disruption_queue_collection,
disruption_queue_record,
source_handler="rest_handler",
monitoring_anomaly_reason=monitoring_anomaly_reason,
score=score,
score_outliers=score_outliers,
vtenant_account=vtenant_conf,
)
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
)
# insert our main fields
record["object_state"] = object_state
record["status_message"] = " | ".join(status_message)
record["status_message_json"] = status_message_json
record["anomaly_reason"] = "|".join(anomaly_reason)
# generate charts resources for this entity
if load_charts_resources:
try:
charts_resources = generate_charts_resources(
tenant_id=tenant_id,
component="dhm",
object=object_value,
keyid=key_value,
anomaly_reason=anomaly_reason,
vtenant_conf=vtenant_conf,
service=service
)
record["charts_resources"] = charts_resources
except Exception as e:
logger.debug(f"Failed to generate charts for DHM entity {key_value}: {str(e)}")
record["charts_resources"] = []
# future tolerance
try:
record["future_tolerance"] = int(
round(merged_future_tolerance, 0)
)
except:
record["future_tolerance"] = -600
# specific for dhm
record["splk_dhm_alerting_policy"] = splk_dhm_alerting_policy
# convert data_last_time_seen to last_time from epoch
last_time = convert_epoch_to_datetime(data_last_time_seen)
record["last_time"] = last_time
# convert data_last_ingest to last_ingest from epoch
last_ingest = convert_epoch_to_datetime(data_last_ingest)
record["last_ingest"] = last_ingest
# convert data_last_time_seen_idx to last_time_idx from epoch
last_time_idx = convert_epoch_to_datetime(data_last_time_seen_idx)
record["last_time_idx"] = last_time_idx
# get and convert latest_flip_time from epoch
latest_flip_time_human = record.get("latest_flip_time", 0)
try:
latest_flip_time_human = float(latest_flip_time_human)
except:
latest_flip_time_human = 0
record["latest_flip_time_human"] = convert_epoch_to_datetime(
latest_flip_time_human
)
# set lag_summary field
record["lag_summary"] = set_feeds_lag_summary(record, component)
# get and set thresholds_duration
(
data_max_delay_allowed_duration,
data_max_lag_allowed_duration,
) = set_feeds_thresholds_duration(record)
record["data_max_delay_allowed_duration"] = (
data_max_delay_allowed_duration
)
record["data_max_lag_allowed_duration"] = (
data_max_lag_allowed_duration
)
# sourcetype summary
record["sourcetype_summary"] = record.get(
f"splk_dhm_st_summary_{mode_view}", "{}"
)
del record["splk_dhm_st_summary_minimal"]
del record["splk_dhm_st_summary_compact"]
# splk_dhm_st_summary_full is needed for UI expansion purposes
# sla_timer
get_sla_timer(record, sla_classes, sla_default_class)
#
# splk-mhm
#
elif component == "mhm":
# first check blocklist
if (
datagen_collection_blocklist_not_regex_dict
or datagen_collection_blocklist_regex_dict
):
append_record = apply_blocklist(
record,
datagen_collection_blocklist_not_regex_dict,
datagen_collection_blocklist_regex_dict,
)
if append_record:
# refresh data_last_lag_seen in the record
try:
record["last_lag_seen"] = time.time() - float(
record.get("metric_last_time_seen", 0)
)
except:
record["last_lag_seen"] = 0
# get metric_details
metric_details = record.get("metric_details", None)
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", metric_details="{metric_details}"'
)
# metric_details summary replacements
record["metric_details"] = record.get(
f"metric_details_{mode_view}", "{}"
)
# remove metric_details_* for optimization purposes
del record["metric_details_minimal"]
del record["metric_details_compact"]
# metric_details_full cannot be removed for UI expansion purposes
# Get logical group information
# get logical group information: object_group_key
object_group_key = record.get("object_group_key", "")
# from logical_coll_dict, get object_logical_group_dict by object_group_key, this is sent to the status function
object_logical_group_dict = logical_coll_dict.get(
object_group_key, {}
)
# get metric_last_time_seen (epochtime)
metric_last_time_seen = record.get("metric_last_time_seen", 0)
try:
metric_last_time_seen = float(metric_last_time_seen)
except:
pass
# call get_future_metrics_status and define isFuture
isFuture, isFutureMsg = get_future_metrics_status(
system_future_tolerance,
metric_last_time_seen,
)
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isFuture="{isFuture}", system_future_tolerance="{system_future_tolerance}", metric_last_time_seen="{metric_last_time_seen}", isFutureMsg="{isFutureMsg}"'
)
# call set_mhm_status and define object_state and anomaly_reason (with hybrid scoring)
# Note: score and score_outliers are already extracted from scores_dict above (lines 921-933)
(
object_state,
status_message,
status_message_json,
anomaly_reason,
) = set_mhm_status(
logger,
request_info.server_rest_uri,
request_info.system_authtoken,
tenant_id,
record,
metric_details,
isFuture,
isFutureMsg,
object_logical_group_dict,
disruption_queue_collection,
disruption_queue_record,
source_handler="rest_handler",
score=score,
score_outliers=score_outliers,
vtenant_account=vtenant_conf,
)
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
)
# insert our main fields
record["object_state"] = object_state
record["status_message"] = " | ".join(status_message)
record["status_message_json"] = status_message_json
record["anomaly_reason"] = "|".join(anomaly_reason)
# generate charts resources for this entity
if load_charts_resources:
try:
charts_resources = generate_charts_resources(
tenant_id=tenant_id,
component="mhm",
object=object_value,
keyid=key_value,
anomaly_reason=anomaly_reason,
vtenant_conf=vtenant_conf,
service=service
)
record["charts_resources"] = charts_resources
except Exception as e:
logger.debug(f"Failed to generate charts for MHM entity {key_value}: {str(e)}")
record["charts_resources"] = []
# convert metric_last_time_seen to last_time from epoch
last_time = convert_epoch_to_datetime(metric_last_time_seen)
record["last_time"] = last_time
# get and convert latest_flip_time from epoch
latest_flip_time_human = record.get("latest_flip_time", 0)
try:
latest_flip_time_human = float(latest_flip_time_human)
except:
latest_flip_time_human = 0
record["latest_flip_time_human"] = convert_epoch_to_datetime(
latest_flip_time_human
)
# set lag_summary field
record["lag_summary"] = set_feeds_lag_summary(record, component)
# sla_timer
get_sla_timer(record, sla_classes, sla_default_class)
#
# splk-flx
#
# get record fields depending on the component
elif component == "flx":
# first check blocklist
if (
datagen_collection_blocklist_not_regex_dict
or datagen_collection_blocklist_regex_dict
):
append_record = apply_blocklist(
record,
datagen_collection_blocklist_not_regex_dict,
datagen_collection_blocklist_regex_dict,
)
if append_record:
# get outliers
try:
isOutlier = int(record.get("isOutlier", 0))
except:
isOutlier = 0
try:
OutliersDisabled = int(record.get("OutliersDisabled", 0))
except:
OutliersDisabled = 0
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}"'
)
# get monitoring time policy and rules (new fields)
monitoring_time_policy = record.get("monitoring_time_policy", None)
# if unset yet, use the tenant level and add to the record
if monitoring_time_policy is None or len(monitoring_time_policy) == 0:
monitoring_time_policy = default_monitoring_time_policy
record["monitoring_time_policy"] = default_monitoring_time_policy
monitoring_time_rules = record.get("monitoring_time_rules", None)
# Get logical group information
# get logical group information: object_group_key
object_group_key = record.get("object_group_key", "")
# from logical_coll_dict, get object_logical_group_dict by object_group_key, this is sent to the status function
object_logical_group_dict = logical_coll_dict.get(
object_group_key, {}
)
# call get_monitoring_time_status and define isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg
(
isUnderMonitoring,
monitoring_anomaly_reason,
isUnderMonitoringMsg,
) = get_monitoring_time_status(
monitoring_time_policy,
monitoring_time_rules,
)
# call get_outliers_status and define isOutlier (with hybrid scoring)
# Note: score and score_outliers are already extracted from scores_dict above (lines 920-923)
isOutlier = get_outliers_status(
isOutlier, OutliersDisabled, tenant_outliers_set_state, score_outliers=score_outliers
)
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", OutliersDisabled="{OutliersDisabled}", tenant_outliers_set_state="{tenant_outliers_set_state}", score_outliers="{score_outliers}"'
)
# Aggregate tracker-keyed JSON fields for concurrent trackers support (same logic as decision maker)
# Aggregate metrics: merge all trackers' metrics into a single dict
# This must be done BEFORE flx_check_dynamic_thresholds which expects aggregated metrics
if "metrics" in record:
try:
metrics_value = record.get("metrics")
if metrics_value:
metrics_by_tracker = None
# Parse if it's a JSON string
if isinstance(metrics_value, str):
try:
metrics_by_tracker = json.loads(metrics_value)
except (json.JSONDecodeError, TypeError):
# If parsing fails, might be old format, skip aggregation
pass
elif isinstance(metrics_value, dict):
metrics_by_tracker = metrics_value
if metrics_by_tracker and isinstance(metrics_by_tracker, dict):
# Check if it's tracker-keyed format (values are dicts) or old format (direct metrics dict)
aggregated_metrics = {}
is_tracker_keyed = False
for key, value in metrics_by_tracker.items():
if isinstance(value, dict):
# Check if value looks like metrics (has numeric/string values) or tracker data
# If all values in the nested dict are simple types, it's likely metrics
if all(isinstance(v, (int, float, str, bool)) or v is None for v in value.values()):
# This is tracker-keyed format, merge all trackers' metrics
aggregated_metrics.update(value)
is_tracker_keyed = True
else:
# Nested structure, might be tracker data
is_tracker_keyed = True
aggregated_metrics.update(value)
else:
# Simple value, old format
break
if is_tracker_keyed:
# Remove internal "status" field from aggregated metrics (not a user metric)
if "status" in aggregated_metrics:
del aggregated_metrics["status"]
# Update record with aggregated metrics as dict (for backward compatibility)
# Handle empty aggregated_metrics case (e.g., {"tracker1": {}})
record["metrics"] = aggregated_metrics
elif not is_tracker_keyed:
# Old format (already aggregated flat dict), remove status field
if isinstance(metrics_value, str):
try:
old_metrics = json.loads(metrics_value)
if isinstance(old_metrics, dict):
if "status" in old_metrics:
old_metrics = old_metrics.copy()
del old_metrics["status"]
record["metrics"] = old_metrics
else:
record["metrics"] = {}
except:
record["metrics"] = {}
else:
# metrics_by_tracker is already the parsed dict
if isinstance(metrics_by_tracker, dict):
if "status" in metrics_by_tracker:
metrics_by_tracker = metrics_by_tracker.copy()
del metrics_by_tracker["status"]
record["metrics"] = metrics_by_tracker
else:
record["metrics"] = {}
except Exception as e:
logger.error(
f'instance_id={self.instance_id}, failed to aggregate metrics for object="{object_value}", '
f'exception="{str(e)}"'
)
# flx thresholds lookup
flx_thresholds_lookup(
object_value,
key_value,
record,
thresholds_collection_dict,
)
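# at this stage record["dynamic_thresholds"] is expected to carry the per-entity thresholds
# resolved from the thresholds collection, as consumed by flx_check_dynamic_thresholds below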
logger.debug(
f'instance_id={self.instance_id}, dynamic_thresholds="{json.dumps(record.get("dynamic_thresholds", {}), indent=2)}"'
)
# flx check dynamic thresholds
threshold_alert, threshold_messages, threshold_scores = (
flx_check_dynamic_thresholds(
logger,
record.get("dynamic_thresholds", {}),
record.get("metrics", {}),
)
)
logger.debug(
f'instance_id={self.instance_id}, result function flx_check_dynamic_thresholds object_value="{object_value}", key_value="{key_value}", threshold_alert="{threshold_alert}", threshold_messages="{threshold_messages}", dynamic_thresholds="{json.dumps(record.get("dynamic_thresholds", {}), indent=2)}", metrics_record="{json.dumps(record.get("metrics", {}), indent=2)}"'
)
# flx drilldown searches lookup
try:
flx_drilldown_searches_lookup(
tenant_id,
record.get("tracker_name", ""),
record.get("account", "local"),
record,
drilldown_searches_collection_dict,
)
logger.debug(
f'instance_id={self.instance_id}, drilldown_search="{record.get("drilldown_search", "")}", drilldown_search_earliest="{record.get("drilldown_search_earliest", "")}", drilldown_search_latest="{record.get("drilldown_search_latest", "")}", drilldown_searches="{json.dumps(record.get("drilldown_searches", []), indent=2)}"'
)
except Exception as e:
logger.error(f"instance_id={self.instance_id}, Error in flx_drilldown_searches_lookup: {str(e)}")
# flx default metrics lookup
try:
flx_default_metrics_lookup(
tenant_id,
record.get("tracker_name", ""),
record,
default_metrics_collection_dict,
)
logger.debug(
f'instance_id={self.instance_id}, default_metric="{record.get("default_metric", "")}"'
)
except Exception as e:
logger.error(f"instance_id={self.instance_id}, Error in flx_default_metrics_lookup: {str(e)}")
# Determine number of trackers to decide if we need prefix
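# tracker_name may come as a JSON array string (e.g. '["tracker_1", "tracker_2"]'),
# a comma-separated string, or a Python list; tracker names here are illustrative only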
num_trackers = 1
if "tracker_name" in record:
try:
tracker_name_value = record.get("tracker_name")
if tracker_name_value:
if isinstance(tracker_name_value, str):
try:
tracker_names = json.loads(tracker_name_value)
if isinstance(tracker_names, list):
num_trackers = len(tracker_names)
except (json.JSONDecodeError, TypeError):
# If parsing fails, might be comma-separated string
if "," in tracker_name_value:
num_trackers = len([t.strip() for t in tracker_name_value.split(",")])
elif isinstance(tracker_name_value, list):
num_trackers = len(tracker_name_value)
except Exception:
pass
# Aggregate status_description: concatenate all trackers' descriptions
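# Illustrative example (hypothetical values): {"tracker_1": "all metrics within thresholds",
# "tracker_2": "threshold breached"} becomes
# "tracker_1: all metrics within thresholds | tracker_2: threshold breached";
# the tracker prefix is only added when more than one tracker exists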
if "status_description" in record:
try:
status_desc_value = record.get("status_description")
if status_desc_value:
status_desc_by_tracker = None
if isinstance(status_desc_value, str):
try:
status_desc_by_tracker = json.loads(status_desc_value)
except (json.JSONDecodeError, TypeError):
# If parsing fails, might be old format string, keep as-is
pass
elif isinstance(status_desc_value, dict):
status_desc_by_tracker = status_desc_value
if status_desc_by_tracker and isinstance(status_desc_by_tracker, dict):
# Check if it's tracker-keyed format (all values are strings) or old format
status_descriptions = []
is_tracker_keyed = False
for tracker_name, desc in status_desc_by_tracker.items():
if isinstance(desc, str):
# Tracker-keyed format
if desc:
# Only add prefix if multiple trackers
if num_trackers > 1:
status_descriptions.append(f"{tracker_name}: {desc}")
else:
status_descriptions.append(desc)
is_tracker_keyed = True
else:
# Not tracker-keyed format
break
if is_tracker_keyed and status_descriptions:
# Update record with aggregated status_description
record["status_description"] = " | ".join(status_descriptions)
except Exception as e:
logger.error(
f'instance_id={self.instance_id}, failed to aggregate status_description for object="{object_value}", '
f'exception="{str(e)}"'
)
# Aggregate status_description_short: concatenate all trackers' descriptions
if "status_description_short" in record:
try:
status_desc_short_value = record.get("status_description_short")
if status_desc_short_value:
status_desc_short_by_tracker = None
if isinstance(status_desc_short_value, str):
try:
status_desc_short_by_tracker = json.loads(status_desc_short_value)
except (json.JSONDecodeError, TypeError):
# If parsing fails, might be old format string, keep as-is
pass
elif isinstance(status_desc_short_value, dict):
status_desc_short_by_tracker = status_desc_short_value
if status_desc_short_by_tracker and isinstance(status_desc_short_by_tracker, dict):
# Check if it's tracker-keyed format
status_descriptions_short = []
is_tracker_keyed = False
for tracker_name, desc in status_desc_short_by_tracker.items():
if isinstance(desc, str):
# Tracker-keyed format
if desc:
# Only add prefix if multiple trackers
if num_trackers > 1:
status_descriptions_short.append(f"{tracker_name}: {desc}")
else:
status_descriptions_short.append(desc)
is_tracker_keyed = True
else:
# Not tracker-keyed format
break
if is_tracker_keyed and status_descriptions_short:
# Update record with aggregated status_description_short
record["status_description_short"] = " | ".join(status_descriptions_short)
except Exception as e:
logger.error(
f'instance_id={self.instance_id}, failed to aggregate status_description_short for object="{object_value}", '
f'exception="{str(e)}"'
)
# Aggregate tracker_name: convert JSON array to comma-separated string for display
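# e.g. (illustrative) '["tracker_1", "tracker_2"]' is rendered as "tracker_1, tracker_2"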
if "tracker_name" in record:
try:
tracker_name_value = record.get("tracker_name")
if tracker_name_value:
if isinstance(tracker_name_value, str):
try:
tracker_names = json.loads(tracker_name_value)
if isinstance(tracker_names, list):
# Convert array to comma-separated string
record["tracker_name"] = ", ".join(tracker_names)
except (json.JSONDecodeError, TypeError):
# If parsing fails, might be old format string, keep as-is
pass
elif isinstance(tracker_name_value, list):
# Already a list, convert to comma-separated string
record["tracker_name"] = ", ".join(tracker_name_value)
except Exception as e:
logger.error(
f'instance_id={self.instance_id}, failed to aggregate tracker_name for object="{object_value}", '
f'exception="{str(e)}"'
)
# Aggregate object_description: concatenate all trackers' descriptions
if "object_description" in record:
try:
object_desc_value = record.get("object_description")
if object_desc_value:
object_desc_by_tracker = None
if isinstance(object_desc_value, str):
try:
object_desc_by_tracker = json.loads(object_desc_value)
except (json.JSONDecodeError, TypeError):
# If parsing fails, might be old format string, keep as-is
pass
elif isinstance(object_desc_value, dict):
object_desc_by_tracker = object_desc_value
if object_desc_by_tracker and isinstance(object_desc_by_tracker, dict):
# Check if it's tracker-keyed format (all values are strings) or old format
object_descriptions = []
is_tracker_keyed = False
for tracker_name, desc in object_desc_by_tracker.items():
if isinstance(desc, str):
# Tracker-keyed format
if desc:
# Only add prefix if multiple trackers
if num_trackers > 1:
object_descriptions.append(f"{tracker_name}: {desc}")
else:
object_descriptions.append(desc)
is_tracker_keyed = True
else:
# Not tracker-keyed format
break
if is_tracker_keyed and object_descriptions:
# Update record with aggregated object_description
record["object_description"] = " | ".join(object_descriptions)
except Exception as e:
logger.error(
f'instance_id={self.instance_id}, failed to aggregate object_description for object="{object_value}", '
f'exception="{str(e)}"'
)
# call set_flx_status and define object_state and anomaly_reason (with hybrid scoring)
# Note: score and score_outliers are already extracted from scores_dict above (lines 921-933)
(
object_state,
status_message,
status_message_json,
anomaly_reason,
) = set_flx_status(
logger,
request_info.server_rest_uri,
request_info.system_authtoken,
tenant_id,
record,
isOutlier,
isUnderMonitoring,
isUnderMonitoringMsg,
object_logical_group_dict,
threshold_alert,
threshold_messages,
disruption_queue_collection,
disruption_queue_record,
source_handler="rest_handler",
monitoring_anomaly_reason=monitoring_anomaly_reason,
score=score,
score_outliers=score_outliers,
threshold_scores=threshold_scores,
vtenant_account=vtenant_conf,
)
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
)
# insert our main fields
record["object_state"] = object_state
record["status_message"] = " | ".join(status_message)
record["status_message_json"] = status_message_json
record["anomaly_reason"] = "|".join(anomaly_reason)
# generate charts resources for this entity
if load_charts_resources:
try:
charts_resources = generate_charts_resources(
tenant_id=tenant_id,
component="flx",
object=object_value,
keyid=key_value,
anomaly_reason=anomaly_reason,
vtenant_conf=vtenant_conf,
service=service
)
record["charts_resources"] = charts_resources
except Exception as e:
logger.debug(f"Failed to generate charts for FLX entity {key_value}: {str(e)}")
record["charts_resources"] = []
# get and convert latest_flip_time from epoch
latest_flip_time_human = record.get("latest_flip_time", 0)
try:
latest_flip_time_human = float(latest_flip_time_human)
except:
latest_flip_time_human = 0
record["latest_flip_time_human"] = convert_epoch_to_datetime(
latest_flip_time_human
)
# sla_timer
get_sla_timer(record, sla_classes, sla_default_class)
#
# splk-fqm
#
# get record fields depending on the component
elif component == "fqm":
# first check blocklist
if (
datagen_collection_blocklist_not_regex_dict
or datagen_collection_blocklist_regex_dict
):
append_record = apply_blocklist(
record,
datagen_collection_blocklist_not_regex_dict,
datagen_collection_blocklist_regex_dict,
)
if append_record:
# get outliers
try:
isOutlier = int(record.get("isOutlier", 0))
except:
isOutlier = 0
try:
OutliersDisabled = int(record.get("OutliersDisabled", 0))
except:
OutliersDisabled = 0
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}"'
)
# get monitoring time policy and rules (new fields)
monitoring_time_policy = record.get("monitoring_time_policy", None)
# if not yet set, fall back to the tenant-level default and add it to the record
if monitoring_time_policy is None or len(monitoring_time_policy) == 0:
monitoring_time_policy = default_monitoring_time_policy
record["monitoring_time_policy"] = default_monitoring_time_policy
monitoring_time_rules = record.get("monitoring_time_rules", None)
# Get logical group information
# get logical group information: object_group_key
object_group_key = record.get("object_group_key", "")
# from logical_coll_dict, get object_logical_group_dict by object_group_key, this is sent to the status function
object_logical_group_dict = logical_coll_dict.get(
object_group_key, {}
)
# call get_monitoring_time_status and define isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg
(
isUnderMonitoring,
monitoring_anomaly_reason,
isUnderMonitoringMsg,
) = get_monitoring_time_status(
monitoring_time_policy,
monitoring_time_rules,
)
# call get_outliers_status and define isOutlier (with hybrid scoring)
# Note: score and score_outliers are already extracted from scores_dict above (lines 920-923)
isOutlier = get_outliers_status(
isOutlier, OutliersDisabled, tenant_outliers_set_state, score_outliers=score_outliers
)
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", OutliersDisabled="{OutliersDisabled}", tenant_outliers_set_state="{tenant_outliers_set_state}", score_outliers="{score_outliers}"'
)
# fqm thresholds lookup
fqm_thresholds_lookup(
object_value,
key_value,
record,
thresholds_collection_dict,
)
logger.debug(
f'instance_id={self.instance_id}, dynamic_thresholds="{json.dumps(record.get("dynamic_thresholds", {}), indent=2)}"'
)
# fqm check dynamic thresholds
threshold_alert, threshold_messages, threshold_scores = (
fqm_check_dynamic_thresholds(
logger,
record.get("dynamic_thresholds", {}),
record.get("metrics", {}),
)
)
logger.debug(
f'instance_id={self.instance_id}, result function fqm_check_dynamic_thresholds object_value="{object_value}", key_value="{key_value}", threshold_alert="{threshold_alert}", threshold_messages="{threshold_messages}", dynamic_thresholds="{json.dumps(record.get("dynamic_thresholds", {}), indent=2)}", metrics_record="{json.dumps(record.get("metrics", {}), indent=2)}"'
)
# call set_fqm_status and define object_state and anomaly_reason (with hybrid scoring)
# Note: score and score_outliers are already extracted from scores_dict above (lines 921-933)
(
object_state,
status_message,
status_message_json,
anomaly_reason,
) = set_fqm_status(
logger,
request_info.server_rest_uri,
request_info.system_authtoken,
tenant_id,
record,
isOutlier,
isUnderMonitoring,
isUnderMonitoringMsg,
object_logical_group_dict,
threshold_alert,
threshold_messages,
disruption_queue_collection,
disruption_queue_record,
source_handler="rest_handler",
monitoring_anomaly_reason=monitoring_anomaly_reason,
score=score,
score_outliers=score_outliers,
threshold_scores=threshold_scores,
vtenant_account=vtenant_conf,
)
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
)
# insert our main fields
record["object_state"] = object_state
record["status_message"] = " | ".join(status_message)
record["status_message_json"] = status_message_json
record["anomaly_reason"] = "|".join(anomaly_reason)
# generate charts resources for this entity
if load_charts_resources:
try:
charts_resources = generate_charts_resources(
tenant_id=tenant_id,
component="fqm",
object=object_value,
keyid=key_value,
anomaly_reason=anomaly_reason,
vtenant_conf=vtenant_conf,
service=service
)
record["charts_resources"] = charts_resources
except Exception as e:
logger.debug(f"Failed to generate charts for FQM entity {key_value}: {str(e)}")
record["charts_resources"] = []
# custom breakby fields support:
# 1 - try to load the content of fields_quality_summary (JSON as string)
# 2 - iterate over the JSON and look for fields metadata.* except the default metadata fields (datamodel, nodename, index, sourcetype)
# 3 - if one or more additional metadata fields are found, add them to the record as metadata_<fieldname> (instead of metadata.<fieldname>)
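# Illustrative example (hypothetical field name): a "metadata.region" entry in
# fields_quality_summary is exposed on the record as "metadata_region"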
if "fields_quality_summary" in record:
try:
fields_quality_summary = json.loads(record["fields_quality_summary"])
for field in fields_quality_summary:
if field.startswith("metadata."):
if field not in ["metadata.datamodel", "metadata.nodename", "metadata.index", "metadata.sourcetype"]:
newfield_name = field.replace("metadata.", "metadata_")
record[f"{newfield_name}"] = fields_quality_summary[field]
except:
pass
# get and convert latest_flip_time from epoch
latest_flip_time_human = record.get("latest_flip_time", 0)
try:
latest_flip_time_human = float(latest_flip_time_human)
except:
latest_flip_time_human = 0
record["latest_flip_time_human"] = convert_epoch_to_datetime(
latest_flip_time_human
)
# sla_timer
get_sla_timer(record, sla_classes, sla_default_class)
#
# splk-wlk
#
# get record fields depending on the component
elif component == "wlk":
# first check blocklist
if (
datagen_collection_blocklist_not_regex_dict
or datagen_collection_blocklist_regex_dict
):
append_record = apply_blocklist(
record,
datagen_collection_blocklist_not_regex_dict,
datagen_collection_blocklist_regex_dict,
)
if append_record:
# set overgroup, if not existing, overgroup is the value of group
if "overgroup" not in record:
record["overgroup"] = record.get("group")
# lookup app enablement
wlk_disabled_apps_lookup(
record.get("app"),
apps_enablement_collection_keys,
apps_enablement_collection_dict,
record,
)
# lookup versioning
wlk_versioning_lookup(
key_value,
versioning_collection_keys,
versioning_collection_dict,
record,
)
# lookup orphan
wlk_orphan_lookup(
key_value,
orphan_collection_keys,
orphan_collection_dict,
record,
)
# Only process if needed
if record.get("app_is_enabled") == "False":
append_record = False
else:
# if mode_view is full, replace metrics with metrics_extended and remove metrics_extended
if mode_view == "full":
record["metrics"] = record.get("metrics_extended", "{}")
if "metrics_extended" in record:
del record["metrics_extended"]
# get outliers
try:
isOutlier = int(record.get("isOutlier", 0))
except:
isOutlier = 0
try:
OutliersDisabled = int(
record.get("OutliersDisabled", 0)
)
except:
OutliersDisabled = 0
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}"'
)
# get monitoring time policy and rules (new fields)
monitoring_time_policy = record.get("monitoring_time_policy", None)
# if not yet set, fall back to the tenant-level default and add it to the record
if monitoring_time_policy is None or len(monitoring_time_policy) == 0:
monitoring_time_policy = default_monitoring_time_policy
record["monitoring_time_policy"] = default_monitoring_time_policy
monitoring_time_rules = record.get("monitoring_time_rules", None)
# call get_monitoring_time_status and define isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg
# Falls back to legacy fields if new fields are not set
(
isUnderMonitoring,
monitoring_anomaly_reason,
isUnderMonitoringMsg,
) = get_monitoring_time_status(
monitoring_time_policy,
monitoring_time_rules,
)
# call get_outliers_status and define isOutlier (with hybrid scoring)
# Note: score and score_outliers are already extracted from scores_dict above (lines 921-933)
isOutlier = get_outliers_status(
isOutlier, OutliersDisabled, tenant_outliers_set_state, score_outliers=score_outliers
)
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", OutliersDisabled="{OutliersDisabled}", tenant_outliers_set_state="{tenant_outliers_set_state}", score_outliers="{score_outliers}"'
)
# call set_wlk_status and define object_state and anomaly_reason (with hybrid scoring)
(
object_state,
status_message,
status_message_json,
anomaly_reason,
) = set_wlk_status(
logger,
request_info.server_rest_uri,
request_info.system_authtoken,
tenant_id,
record,
isOutlier,
isUnderMonitoring,
isUnderMonitoringMsg,
disruption_queue_collection,
disruption_queue_record,
source_handler="rest_handler",
monitoring_anomaly_reason=monitoring_anomaly_reason,
score=score,
score_outliers=score_outliers,
vtenant_account=vtenant_conf,
)
logger.debug(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
)
# insert our main fields
record["object_state"] = object_state
record["status_message"] = " | ".join(status_message)
record["status_message_json"] = status_message_json
record["anomaly_reason"] = "|".join(anomaly_reason)
# generate charts resources for this entity
if load_charts_resources:
try:
charts_resources = generate_charts_resources(
tenant_id=tenant_id,
component="wlk",
object=object_value,
keyid=key_value,
anomaly_reason=anomaly_reason,
vtenant_conf=vtenant_conf,
service=service
)
record["charts_resources"] = charts_resources
except Exception as e:
logger.debug(f"Failed to generate charts for WLK entity {key_value}: {str(e)}")
record["charts_resources"] = []
# get and convert latest_flip_time from epoch
latest_flip_time_human = record.get("latest_flip_time", 0)
try:
latest_flip_time_human = float(latest_flip_time_human)
except:
latest_flip_time_human = 0
record["latest_flip_time_human"] = (
convert_epoch_to_datetime(latest_flip_time_human)
)
# convert last_seen from epoch
last_seen = convert_epoch_to_datetime(
record.get("last_seen", 0)
)
record["last_seen_human"] = last_seen
# sla_timer
get_sla_timer(record, sla_classes, sla_default_class)
if append_record:
#
# if we do not have a value for object_state or object_state is empty, define to red
#
if not record.get("object_state", None):
record["object_state"] = "red"
#
# state icon code
#
record["state_icon_code"] = define_state_icon_code(record)
#
# End, add to the processed_records list
#
processed_records.append(record)
# log debug only
logger.debug(f'instance_id={self.instance_id}, record="{json.dumps(record, indent=2)}"')
#
# End per component processing
#
except Exception as e:
logger.error(
f'instance_id={self.instance_id}, tenant_id="{tenant_id}", component="{component}", Error processing record, record="{json.dumps(record, indent=2)}", exception="{str(e)}"'
)
continue # Proceed with next record
try:
logger.info(
f'instance_id={self.instance_id}, collection_name="{data_collection_name}", page="{page}", size="{size}", collection_count="{total_record_count}", last_page="{last_page}"'
)
filtered_records = filter_records(processed_records, query_parameters_json)
"""
for dev debug only
if len(filtered_records) > 0:
for record in filtered_records[:10]:
logger.debug(f'record="{json.dumps(record, indent=2)}"')
else:
logger.debug(f"no results found")
"""
# log info
logger.info(
f'instance_id="{self.instance_id}", trackme_rest_handler_component_user has terminated, run_time="{round((time.time() - start), 3)}"'
)
if pagination_mode == "remote":
return {
"payload": {
"last_page": last_page,
"data": filtered_records,
},
"status": 200,
}
elif pagination_mode == "local":
return {
"payload": filtered_records,
"status": 200,
}
except Exception as e:
response = {
"action": "failure",
"response": f'an exception was encountered, exception="{str(e)}"',
}
logger.error(f"instance_id={self.instance_id}, {json.dumps(response)}")
return {"payload": response, "status": 500}
# Get the component data with pagination and progressive load capabilities
def post_load_component_data_full(self, request_info, **kwargs):
describe = False
try:
resp_dict = json.loads(str(request_info.raw_args["payload"]))
except Exception as e:
resp_dict = None
if resp_dict is not None:
try:
describe = resp_dict["describe"]
if describe in ("true", "True"):
describe = True
except Exception as e:
describe = False
if not describe:
# tenant_id
try:
tenant_id = resp_dict["tenant_id"]
except Exception as e:
return {
"payload": {"error": "tenant_id is required"},
"status": 500,
}
# component
try:
component = resp_dict["component"]
if component not in (
"dsm",
"dhm",
"mhm",
"flx",
"fqm",
"wlk",
):
return {
"payload": {"error": "component is invalid"},
"status": 500,
}
except Exception as e:
return {
"payload": {"error": "component is required"},
"status": 500,
}
else:
describe = True
# if describe is requested, show the usage
if describe:
response = {
"describe": "This endpoint retrieves and returns the full component data with pagination and multithreading, it calls the load_component_data endpoint accordingly, it requires a POST call using data and the following options:",
"resource_desc": "Retrieve the full component data with pagination and multithreading",
"resource_spl_example": "| trackme url=\"/services/trackme/v2/component/load_component_data_full\" mode=\"post\" body=\"{'tenant_id': 'mytenant', 'component': 'flx'}\"",
"options": [
{
"tenant_id": "tenant identifier",
"component": "component identifier, valid options are: flx, dsm, dhm, mhm, wlk, fqm",
}
],
}
return {"payload": response, "status": 200}
# set loglevel
loglevel = trackme_getloglevel(
request_info.system_authtoken, request_info.server_rest_port
)
logger.setLevel(loglevel)
# performance counter
start = time.time()
params = {
"tenant_id": tenant_id,
"component": component,
"page": 1,
"size": 0,
}
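# page=1 with size=0 is used to request the complete dataset from load_component_data
# in a single call (assumption based on how the response is consumed below)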
# Define a header for authenticated requests to splunkd
header = {
"Authorization": f"Splunk {request_info.system_authtoken}",
"Content-Type": "application/json",
}
# Target the load_component_data endpoint
url = f"{request_info.server_rest_uri}/services/trackme/v2/component/load_component_data"
# results_records list
results_records = []
# Proceed
try:
response = requests.get(
url,
headers=header,
params=params,
verify=False,
timeout=600,
)
if response.status_code not in (200, 201, 204):
msg = f'get component has failed, response.status_code="{response.status_code}", response.text="{response.text}"'
raise Exception(msg)
else:
response_json = response.json()
last_page = response_json.get("last_page", 1)
data = response_json.get("data", [])
# add the data to the data_records
for record in data:
results_records.append(record)
logger.info(
f"retrieved page 1 with {len(data)} records, last_page={last_page}"
)
except Exception as e:
msg = f'get component has failed, exception="{str(e)}"'
logger.error(msg)
return {"payload": {"response": msg}, "status": 500}
# run_time
run_time = round((time.time() - start), 3)
# return the response
logger.info(
f'context="perf", no_records="{len(results_records)}", run_time="{run_time}", tenant_id="{tenant_id}", component="{component}"'
)
return {
"payload": {
"data": results_records,
"entities": len(results_records),
"run_time": run_time,
},
"status": 200,
}
def get_chart_labels_and_descriptions():
"""
Function to get chart labels and descriptions mapping
Returns:
Dictionary mapping chart types to their labels, descriptions, and chart types
"""
return {
"latency": {
"label": "Event Latency",
"description": "Event latency over time showing data ingestion delays",
"chart_type": "line"
},
"delay": {
"label": "Event Delay",
"description": "Event delay over time showing time between event occurrence and ingestion",
"chart_type": "line"
},
"volume": {
"label": "Event Volume",
"description": "Event volume over time showing the number of events",
"chart_type": "line"
},
"hosts_dcount": {
"label": "Hosts Count",
"description": "Distinct host count over time",
"chart_type": "line"
},
"data_sampling_anomaly": {
"label": "Data Sampling Anomaly",
"description": "Data sampling model match percentage over time",
"chart_type": "bar"
},
"flx_status": {
"label": "FLX Status",
"description": "FLX entity status over time",
"chart_type": "line"
},
"incidents_events": {
"label": "Incident Events",
"description": "Stateful alert incidents timeline",
"chart_type": "bar"
},
"flipping_events": {
"label": "State Flipping Events",
"description": "Entity state changes over time",
"chart_type": "bar"
},
"state_events": {
"label": "State Events",
"description": "Entity state distribution over time",
"chart_type": "bar"
}
}
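# Usage sketch (illustrative): get_chart_labels_and_descriptions()["latency"]["label"]
# returns "Event Latency"; generate_charts_resources relies on this mapping below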
def generate_charts_resources(tenant_id, component, object, keyid, anomaly_reason, vtenant_conf, service):
"""
Function to generate chart resources for an entity based on component type and anomaly reasons
Args:
tenant_id: The tenant ID
component: The component type (dsm, dhm, mhm, flx, fqm, wlk)
object: The object name
keyid: The object key ID
anomaly_reason: List of anomaly reasons
vtenant_conf: Virtual tenant configuration
service: Splunk service object
Returns:
List of chart dictionaries with chart_label, chart_description, chart_search, and chart_type
"""
charts = []
chart_labels = get_chart_labels_and_descriptions()
try:
# Parse anomaly_reason if it's a string
if isinstance(anomaly_reason, str):
anomaly_reason = anomaly_reason.split("|") if anomaly_reason else []
elif not isinstance(anomaly_reason, list):
anomaly_reason = []
# Normalize anomaly_reason (remove empty strings)
anomaly_reason = [reason for reason in anomaly_reason if reason and reason.strip()]
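# e.g. (illustrative) "data_sampling_anomaly|some_other_reason" becomes
# ["data_sampling_anomaly", "some_other_reason"]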
# Create object_category for chart search
object_category = f"splk-{component}"
# Check tenant-level feature enablement
try:
outliers_enabled = int(vtenant_conf.get(f'mloutliers_{component}', 1)) == 1
except Exception as e:
outliers_enabled = False
try:
sampling_enabled = int(vtenant_conf.get('sampling', 1)) == 1
except Exception as e:
sampling_enabled = False
# Component-specific chart logic
if component in ("dsm", "dhm"):
# Always include basic charts for DSM/DHM
for chart_type in ["latency", "delay", "volume"]:
chart_search = get_chart_search(
chart_type=chart_type,
tenant_id=tenant_id,
object_category=object_category,
object=object,
keyid=keyid
)
if chart_search:
chart_info = chart_labels.get(chart_type, {})
charts.append({
"chart_label": chart_info.get("label", chart_type),
"chart_description": chart_info.get("description", f"{chart_type} chart"),
"chart_search": chart_search,
"chart_type": chart_info.get("chart_type", "line")
})
# DSM-specific charts
if component == "dsm":
# hosts_dcount chart
chart_search = get_chart_search(
chart_type="hosts_dcount",
tenant_id=tenant_id,
object_category=object_category,
object=object,
keyid=keyid
)
if chart_search:
chart_info = chart_labels.get("hosts_dcount", {})
charts.append({
"chart_label": chart_info.get("label", "Hosts Count"),
"chart_description": chart_info.get("description", "Distinct host count over time"),
"chart_search": chart_search,
"chart_type": chart_info.get("chart_type", "line")
})
# data_sampling_anomaly chart (if anomaly present and sampling enabled)
if "data_sampling_anomaly" in anomaly_reason and sampling_enabled:
chart_search = get_chart_search(
chart_type="data_sampling_anomaly",
tenant_id=tenant_id,
object_category=object_category,
object=object,
keyid=keyid
)
if chart_search:
chart_info = chart_labels.get("data_sampling_anomaly", {})
charts.append({
"chart_label": chart_info.get("label", "Data Sampling Anomaly"),
"chart_description": chart_info.get("description", "Data sampling model match percentage over time"),
"chart_search": chart_search,
"chart_type": chart_info.get("chart_type", "bar")
})
elif component == "flx":
# FLX status chart
chart_search = get_chart_search(
chart_type="flx_status",
tenant_id=tenant_id,
object_category=object_category,
object=object,
keyid=keyid
)
if chart_search:
chart_info = chart_labels.get("flx_status", {})
charts.append({
"chart_label": chart_info.get("label", "FLX Status"),
"chart_description": chart_info.get("description", "FLX entity status over time"),
"chart_search": chart_search,
"chart_type": chart_info.get("chart_type", "line")
})
# Dynamic FLX metrics
try:
flx_metrics = flx_get_metrics_catalog_for_object_id(
None, service, tenant_id, keyid, timerange_charts="24h"
)
if flx_metrics:
for flx_metric in flx_metrics:
chart_search = get_chart_search(
chart_type="flx_metric_group",
tenant_id=tenant_id,
object_category=object_category,
object=object,
keyid=keyid,
metric_list=[flx_metric]
)
if chart_search:
# Determine chart type based on the metric name (bar if it contains "count", otherwise line)
chart_type = "bar" if "count" in flx_metric.lower() else "line"
charts.append({
"chart_label": f"FLX Metric: {flx_metric}",
"chart_description": f"FLX metrics over time for {flx_metric}",
"chart_search": chart_search,
"chart_type": chart_type
})
except Exception as e:
logger.debug(f"Failed to get FLX metrics for {keyid}: {str(e)}")
elif component == "fqm":
# Dynamic FQM metrics
try:
fqm_metrics = fqm_get_metrics_catalog_for_object_id(
None, service, tenant_id, keyid, timerange_charts="24h"
)
if fqm_metrics:
for fqm_metric in fqm_metrics:
chart_search = get_chart_search(
chart_type="fqm_metric_group",
tenant_id=tenant_id,
object_category=object_category,
object=object,
keyid=keyid,
metric_list=[fqm_metric]
)
if chart_search:
# Determine chart type based on the metric name (bar if it contains "count", otherwise line)
chart_type = "bar" if "count" in fqm_metric.lower() else "line"
charts.append({
"chart_label": f"FQM Metric: {fqm_metric}",
"chart_description": f"FQM metrics over time for {fqm_metric}",
"chart_search": chart_search,
"chart_type": chart_type
})
except Exception as e:
logger.debug(f"Failed to get FQM metrics for {keyid}: {str(e)}")
elif component == "wlk":
# Dynamic WLK metrics
try:
wlk_metrics = wlk_get_metrics_catalog_for_object_id(
None, service, tenant_id, keyid, timerange_charts="24h"
)
if wlk_metrics:
for wlk_metric in wlk_metrics:
chart_search = get_chart_search(
chart_type="wlk_metric_group",
tenant_id=tenant_id,
object_category=object_category,
object=object,
keyid=keyid,
metric_list=[wlk_metric]
)
if chart_search:
# Determine chart type based on the metric name (bar if it contains "count", otherwise line)
chart_type = "bar" if "count" in wlk_metric.lower() else "line"
charts.append({
"chart_label": f"WLK Metric: {wlk_metric}",
"chart_description": f"WLK metrics over time for {wlk_metric}",
"chart_search": chart_search,
"chart_type": chart_type
})
except Exception as e:
logger.debug(f"Failed to get WLK metrics for {keyid}: {str(e)}")
# ML Outliers charts (for applicable components)
if component in ("dsm", "dhm", "flx", "fqm", "wlk") and outliers_enabled:
logger.debug(f'handling mloutliers_detection for {component} entity {keyid}, outliers_enabled={outliers_enabled}')
try:
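# wrap the module logger in a minimal helper-like adapter exposing log_debug/log_info/log_error,
# so it can be passed to get_mlmodels_from_kvstore in place of an alert action helper object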
class _HelperAdapter:
def __init__(self, base_logger):
self._logger = base_logger
def log_debug(self, message):
self._logger.debug(message)
def log_info(self, message):
self._logger.info(message)
def log_error(self, message):
self._logger.error(message)
helper_adapter = _HelperAdapter(logger)
ml_models = get_mlmodels_from_kvstore(
helper_adapter, service, tenant_id, component, object, keyid
)
if ml_models:
for model_id in ml_models:
chart_search = get_chart_search(
chart_type="ml_outliers",
tenant_id=tenant_id,
object_category=object_category,
object=object,
keyid=keyid,
model_id=model_id
)
if chart_search:
charts.append({
"chart_label": f"ML Outliers: {model_id}",
"chart_description": f"Machine learning outliers detection for model {model_id}",
"chart_search": chart_search,
"chart_type": "line"
})
except Exception as e:
logger.error(f"Failed to get ML models for {keyid}: {str(e)}")
# Common charts for all components
for chart_type in ["incidents_events", "flipping_events", "state_events"]:
chart_search = get_chart_search(
chart_type=chart_type,
tenant_id=tenant_id,
object_category=object_category,
object=object,
keyid=keyid
)
if chart_search:
chart_info = chart_labels.get(chart_type, {})
charts.append({
"chart_label": chart_info.get("label", chart_type),
"chart_description": chart_info.get("description", f"{chart_type} chart"),
"chart_search": chart_search,
"chart_type": chart_info.get("chart_type", "line")
})
except Exception as e:
logger.error(f"Error generating charts for {component} entity {keyid}: {str(e)}")
# Return empty list on error to not break the main response
return []
return charts