You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
2510 lines
128 KiB
2510 lines
128 KiB
#!/usr/bin/env python
|
|
# coding=utf-8
|
|
|
|
__author__ = "TrackMe Limited"
|
|
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
|
|
__credits__ = "TrackMe Limited, U.K."
|
|
__license__ = "TrackMe Limited, all rights reserved"
|
|
__version__ = "0.1.0"
|
|
__maintainer__ = "TrackMe Limited, U.K."
|
|
__email__ = "support@trackme-solutions.com"
|
|
__status__ = "PRODUCTION"
|
|
|
|
# Standard library
|
|
import os
|
|
import sys
|
|
import json
|
|
import time
|
|
|
|
# Logging
|
|
import logging
|
|
from logging.handlers import RotatingFileHandler
|
|
|
|
# Networking
|
|
import urllib3
|
|
import requests
|
|
|
|
# Splunkd typically runs with self-signed certificates; suppress the
# InsecureRequestWarning noise that requests/urllib3 would otherwise emit
# for every verify=False call made by this command.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# splunk home
# Resolve the Splunk installation root; raises KeyError if SPLUNK_HOME is
# not set (this module is only expected to run inside a Splunk instance).
splunkhome = os.environ["SPLUNK_HOME"]

# set logging
# Dedicated rotating log file for this command: ~10 MB per file, one backup.
filehandler = RotatingFileHandler(
    "%s/var/log/splunk/trackme_decision_maker.log" % splunkhome,
    mode="a",
    maxBytes=10000000,
    backupCount=1,
)
formatter = logging.Formatter(
    "%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s"
)
# Emit log timestamps in UTC (gmtime) rather than the server's local time.
logging.Formatter.converter = time.gmtime
filehandler.setFormatter(formatter)
log = logging.getLogger()  # root logger - Good to get it only once.
for hdlr in log.handlers[:]:  # remove the existing file handlers
    if isinstance(hdlr, logging.FileHandler):
        log.removeHandler(hdlr)
log.addHandler(filehandler)  # set the new handler
# set the log level to INFO, DEBUG as the default is ERROR
# (the effective level is raised/lowered again at runtime in stream()
# from the TrackMe configuration)
log.setLevel(logging.INFO)
|
|
|
|
# append current directory
|
|
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
|
|
|
# import libs
|
|
import import_declare_test
|
|
|
|
# import Splunk
|
|
from splunklib.searchcommands import (
|
|
dispatch,
|
|
StreamingCommand,
|
|
Configuration,
|
|
Option,
|
|
validators,
|
|
)
|
|
|
|
# Import trackme libs
|
|
from trackme_libs import trackme_reqinfo, trackme_vtenant_account
|
|
|
|
# import TrackMe get data libs
|
|
from trackme_libs_get_data import (
|
|
get_feeds_datagen_kv_collection,
|
|
get_sampling_kv_collection,
|
|
search_kv_collection_restmode,
|
|
search_kv_collection_searchmode,
|
|
search_kv_collection_sdkmode,
|
|
)
|
|
|
|
# Import trackme decisionmaker libs
|
|
from trackme_libs_decisionmaker import (
|
|
convert_epoch_to_datetime,
|
|
get_monitoring_time_status,
|
|
get_outliers_status,
|
|
get_data_sampling_status,
|
|
get_future_status,
|
|
get_future_metrics_status,
|
|
get_is_under_dcount_host,
|
|
get_logical_groups_collection_records,
|
|
get_dsm_latency_status,
|
|
get_dsm_delay_status,
|
|
set_dsm_status,
|
|
set_dhm_status,
|
|
set_mhm_status,
|
|
set_flx_status,
|
|
set_fqm_status,
|
|
set_wlk_status,
|
|
apply_blocklist,
|
|
dynamic_priority_lookup,
|
|
dynamic_tags_lookup,
|
|
dynamic_sla_class_lookup,
|
|
get_sla_timer,
|
|
dsm_sampling_lookup,
|
|
sampling_anomaly_status,
|
|
flx_thresholds_lookup,
|
|
fqm_thresholds_lookup,
|
|
flx_check_dynamic_thresholds,
|
|
fqm_check_dynamic_thresholds,
|
|
flx_drilldown_searches_lookup,
|
|
flx_default_metrics_lookup,
|
|
calculate_score,
|
|
)
|
|
|
|
# import trackme libs disruption queue
|
|
from trackme_libs_disruption_queue import (
|
|
disruption_queue_lookup,
|
|
disruption_queue_update,
|
|
disruption_queue_get_duration,
|
|
)
|
|
|
|
# Import TrackMe splk-flx libs
|
|
from trackme_libs_splk_flx import trackme_flx_gen_metrics
|
|
|
|
# Import TrackMe splk-fqm libs
|
|
from trackme_libs_splk_fqm import trackme_fqm_gen_metrics
|
|
|
|
# Import TrackMe utils libs
|
|
from trackme_libs_utils import get_uuid
|
|
|
|
|
|
@Configuration(distributed=False)
|
|
class TrackMeDecisionMaker(StreamingCommand):
|
|
# Mandatory search-command option: the TrackMe virtual tenant the
# command operates on (used to resolve per-tenant KVstore collections,
# macros and index settings).
tenant_id = Option(
    doc="""
    **Syntax:** **tenant_id=****
    **Description:** The tenant identifier.""",
    require=True,
    default=None,
)

# Mandatory search-command option: the TrackMe component the incoming
# records belong to; restricted by regex to the supported components
# (dsm, dhm, mhm, wlk, flx, fqm).
component = Option(
    doc="""
    **Syntax:** **component=****
    **Description:** Specify the TrackMe component.""",
    require=True,
    default=None,
    validate=validators.Match("component", r"^(dsm|dhm|mhm|wlk|flx|fqm)$"),
)
|
|
|
|
"""
|
|
This function ensures that records have the same list of fields to allow Splunk to automatically extract these fields
|
|
If a given result does not have a given field, it will be added to the record as an empty value
|
|
"""
|
|
|
|
def generate_fields(self, records):
    """Yield every record with a harmonized set of fields.

    Splunk extracts fields from the first results it sees, so all records
    must expose the same keys. Any key present in at least one record is
    added to every other record with an empty-string value.

    NOTE(review): records is iterated twice, so it must be a re-iterable
    sequence (e.g. a list), not a one-shot generator — confirm at call site.
    Records are mutated in place and yielded one by one.
    """
    # First pass: union of all field names across the result set.
    field_names = set()
    for rec in records:
        field_names.update(rec.keys())

    # Second pass: backfill the missing fields, then yield.
    for rec in records:
        for name in field_names.difference(rec):
            rec[name] = ""
        yield rec
|
|
|
|
def get_tenant_metric_idx(self):
    """Return the metric index configured for this tenant.

    Calls the TrackMe REST endpoint
    ``/services/trackme/v2/vtenants/tenant_idx_settings`` on the local
    splunkd with the command's session key and returns the value of the
    ``trackme_metric_idx`` stanza for ``self.tenant_id``.

    Raises:
        Exception: on a non-2xx HTTP status or any request/parsing
            failure; the error is logged before raising.

    NOTE(review): relies on ``self.instance_id`` being set beforehand
    (stream() assigns it before calling this method) — confirm no other
    caller invokes it earlier.
    """
    # Header for requests-authenticated communications with splunkd.
    header = {
        "Authorization": "Splunk %s" % self._metadata.searchinfo.session_key,
        "Content-Type": "application/json",
    }

    # Endpoint returning the index configuration for this tenant.
    url = "%s/services/trackme/v2/vtenants/tenant_idx_settings" % (
        self._metadata.searchinfo.splunkd_uri
    )
    data = {"tenant_id": self.tenant_id, "idx_stanza": "trackme_metric_idx"}

    # Retrieve the tenant idx; on any failure, log and raise.
    try:
        response = requests.post(
            url,
            headers=header,
            data=json.dumps(data, indent=1),
            # NOTE(review): verify=False disables TLS verification; this
            # targets the local splunkd (self-signed certs are the norm),
            # but confirm this is intentional for your deployment.
            verify=False,
            timeout=600,
        )
        if response.status_code not in (200, 201, 204):
            error_msg = f'instance_id={self.instance_id}, failed to retrieve the tenant metric index, response.status_code="{response.status_code}", response.text="{response.text}"'
            logging.error(error_msg)
            raise Exception(error_msg)
        else:
            # Parse the JSON payload directly; the previous
            # json.loads(json.dumps(response.json(), indent=1)) was a
            # pointless serialize/deserialize roundtrip.
            response_data = response.json()
            tenant_trackme_metric_idx = response_data["trackme_metric_idx"]
    except Exception as e:
        error_msg = (
            f'instance_id={self.instance_id}, failed to retrieve the tenant metric index, exception="{str(e)}"'
        )
        logging.error(error_msg)
        raise Exception(error_msg)

    return tenant_trackme_metric_idx
|
|
|
|
"""
|
|
Stream function
|
|
"""
|
|
|
|
def stream(self, records):
|
|
# Start performance counter
|
|
start = time.time()
|
|
|
|
# Get request info and set logging level
|
|
reqinfo = trackme_reqinfo(
|
|
self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri
|
|
)
|
|
log.setLevel(reqinfo["logging_level"])
|
|
|
|
# set instance_id
|
|
self.instance_id = get_uuid()
|
|
|
|
# Get virtual tenant account
|
|
vtenant_conf = trackme_vtenant_account(
|
|
self._metadata.searchinfo.session_key,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self.tenant_id,
|
|
)
|
|
|
|
# get metric index
|
|
metric_index = self.get_tenant_metric_idx()
|
|
|
|
#
|
|
# System level settings
|
|
#
|
|
|
|
system_future_tolerance = float(
|
|
reqinfo["trackme_conf"]["splk_general"][
|
|
"splk_general_feeds_future_tolerance"
|
|
]
|
|
)
|
|
|
|
#
|
|
# System level default minimal disruption period
|
|
#
|
|
|
|
default_disruption_min_time_sec = int(
|
|
vtenant_conf["default_disruption_min_time_sec"]
|
|
)
|
|
|
|
#
|
|
# Tenant level default monitoring time policy
|
|
#
|
|
|
|
try:
|
|
default_monitoring_time_policy = vtenant_conf["monitoring_time_policy"]
|
|
except Exception as e:
|
|
default_monitoring_time_policy = "all_time"
|
|
|
|
# set task
|
|
#
|
|
task_start = time.time()
|
|
task_instance_id = get_uuid()
|
|
task_name = "get_priority_collection_records"
|
|
|
|
# dynamic priority, for all components
|
|
# get priority collection
|
|
priority_collection_name = (
|
|
f"kv_trackme_{self.component}_priority_tenant_{self.tenant_id}"
|
|
)
|
|
priority_collection = self.service.kvstore[priority_collection_name]
|
|
(
|
|
priority_records,
|
|
priority_collection_keys,
|
|
priority_collection_dict,
|
|
last_page,
|
|
) = search_kv_collection_sdkmode(
|
|
logging, self.service, priority_collection_name, page=1, page_count=0, orderby="keyid"
|
|
)
|
|
|
|
# end task
|
|
#
|
|
task_end = time.time()
|
|
task_run_time = round((task_end - task_start), 3)
|
|
logging.info(
|
|
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
|
|
)
|
|
|
|
# set task
|
|
#
|
|
task_start = time.time()
|
|
task_instance_id = get_uuid()
|
|
task_name = "get_tags_collection_records"
|
|
|
|
# get tags collection
|
|
tags_collection_name = (
|
|
f"kv_trackme_{self.component}_tags_tenant_{self.tenant_id}"
|
|
)
|
|
tags_collection = self.service.kvstore[tags_collection_name]
|
|
(
|
|
tags_records,
|
|
tags_collection_keys,
|
|
tags_collection_dict,
|
|
last_page,
|
|
) = search_kv_collection_sdkmode(
|
|
logging, self.service, tags_collection_name, page=1, page_count=0, orderby="keyid"
|
|
)
|
|
|
|
# end task
|
|
#
|
|
task_end = time.time()
|
|
task_run_time = round((task_end - task_start), 3)
|
|
logging.info(
|
|
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
|
|
)
|
|
|
|
# set task
|
|
#
|
|
task_start = time.time()
|
|
task_instance_id = get_uuid()
|
|
task_name = "get_sla_collection_records"
|
|
|
|
# dynamic sla_class, for all components
|
|
# get sla collection
|
|
sla_collection_name = f"kv_trackme_{self.component}_sla_tenant_{self.tenant_id}"
|
|
sla_collection = self.service.kvstore[sla_collection_name]
|
|
(
|
|
sla_records,
|
|
sla_collection_keys,
|
|
sla_collection_dict,
|
|
last_page,
|
|
) = search_kv_collection_sdkmode(
|
|
logging, self.service, sla_collection_name, page=1, page_count=0, orderby="keyid"
|
|
)
|
|
|
|
# end task
|
|
#
|
|
task_end = time.time()
|
|
task_run_time = round((task_end - task_start), 3)
|
|
logging.info(
|
|
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
|
|
)
|
|
|
|
# set task
|
|
#
|
|
task_start = time.time()
|
|
task_instance_id = get_uuid()
|
|
task_name = "get_disruption_queue_collection_records"
|
|
|
|
# get disruption queue collection
|
|
disruption_queue_collection_name = (
|
|
f"kv_trackme_common_disruption_queue_tenant_{self.tenant_id}"
|
|
)
|
|
disruption_queue_collection = self.service.kvstore[
|
|
disruption_queue_collection_name
|
|
]
|
|
(
|
|
disruption_queue_records,
|
|
disruption_queue_collection_keys,
|
|
disruption_queue_collection_dict,
|
|
last_page,
|
|
) = search_kv_collection_sdkmode(
|
|
logging, self.service, disruption_queue_collection_name, page=1, page_count=0, orderby="keyid"
|
|
)
|
|
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, disruption_queue_collection_dict="{json.dumps(disruption_queue_collection_dict, indent=2)}"'
|
|
)
|
|
|
|
# end task
|
|
#
|
|
task_end = time.time()
|
|
task_run_time = round((task_end - task_start), 3)
|
|
logging.info(
|
|
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
|
|
)
|
|
|
|
#
|
|
# SLA timer
|
|
#
|
|
|
|
sla_classes = {}
|
|
sla_default_class = None
|
|
|
|
sla_classes = reqinfo["trackme_conf"]["sla"]["sla_classes"]
|
|
# try loading the JSON
|
|
try:
|
|
sla_classes = json.loads(sla_classes)
|
|
sla_default_class = reqinfo["trackme_conf"]["sla"]["sla_default_class"]
|
|
if not len(sla_default_class) > 0 or sla_default_class not in sla_classes:
|
|
sla_default_class = "silver"
|
|
logging.error(
|
|
f'instance_id={self.instance_id}, Invalid sla_default_class="{sla_default_class}", this SLA class is not part of the SLA classes, applying fallback configuration'
|
|
)
|
|
|
|
except:
|
|
logging.error(
|
|
f'instance_id={self.instance_id}, Error loading sla_classes JSON, please check the configuration, the JSON is not valid JSON, applying fallback configuration, exception="{str(e)}"'
|
|
)
|
|
sla_classes = json.loads(
|
|
'{"gold": {"sla_threshold": 14400, "rank": 3}, "silver": {"sla_threshold": 86400, "rank": 2}, "platinum": {"sla_threshold": 172800, "rank": 1}}'
|
|
)
|
|
sla_default_class = "silver"
|
|
|
|
# retrieve the score for the tenant and component
|
|
scores_dict = calculate_score(self.service, self.tenant_id, self.component)
|
|
logging.info(
|
|
f'instance_id={self.instance_id}, scores_dict="{json.dumps(scores_dict, indent=2)}"'
|
|
)
|
|
|
|
#
|
|
# splk-dsm specific collections
|
|
#
|
|
|
|
if self.component == "dsm":
|
|
|
|
# set task
|
|
#
|
|
task_start = time.time()
|
|
task_instance_id = get_uuid()
|
|
task_name = "get_sampling_collection_records"
|
|
|
|
# Data sampling
|
|
sampling_collection_name = (
|
|
f"kv_trackme_dsm_data_sampling_tenant_{self.tenant_id}"
|
|
)
|
|
sampling_collection = self.service.kvstore[sampling_collection_name]
|
|
sampling_records, sampling_collection_keys, sampling_collection_dict = (
|
|
get_sampling_kv_collection(
|
|
sampling_collection, sampling_collection_name
|
|
)
|
|
)
|
|
|
|
# end task
|
|
#
|
|
task_end = time.time()
|
|
task_run_time = round((task_end - task_start), 3)
|
|
logging.info(
|
|
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
|
|
)
|
|
|
|
# dhm specific
|
|
|
|
if self.component == "dhm":
|
|
macro_name = (
|
|
f"trackme_dhm_default_splk_dhm_alert_policy_tenant_{self.tenant_id}"
|
|
)
|
|
macro_current = self.service.confs["macros"][macro_name]
|
|
default_splk_dhm_alerting_policy = macro_current.content.get("definition")
|
|
# remove double quotes from default_splk_dhm_alerting_policy
|
|
default_splk_dhm_alerting_policy = default_splk_dhm_alerting_policy.replace(
|
|
'"', ""
|
|
)
|
|
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, default_splk_dhm_alerting_policy="{default_splk_dhm_alerting_policy}"'
|
|
)
|
|
|
|
#
|
|
# component specific collections
|
|
#
|
|
|
|
if self.component in ["dsm", "dhm", "mhm", "flx", "fqm", "wlk"]:
|
|
|
|
# set task
|
|
#
|
|
task_start = time.time()
|
|
task_instance_id = get_uuid()
|
|
task_name = "get_datagen_collection_records"
|
|
|
|
# datagen
|
|
datagen_collection_name = (
|
|
f"kv_trackme_{self.component}_allowlist_tenant_{self.tenant_id}"
|
|
)
|
|
datagen_collection = self.service.kvstore[datagen_collection_name]
|
|
(
|
|
datagen_records,
|
|
datagen_collection_keys,
|
|
datagen_collection_dict,
|
|
datagen_collection_blocklist_not_regex_dict,
|
|
datagen_collection_blocklist_regex_dict,
|
|
) = get_feeds_datagen_kv_collection(
|
|
datagen_collection, datagen_collection_name, self.component
|
|
)
|
|
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, datagen_collection_dict="{json.dumps(datagen_collection_dict, indent=2)}"'
|
|
)
|
|
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, datagen_collection_blocklist_not_regex_dict="{json.dumps(datagen_collection_blocklist_not_regex_dict, indent=2)}"'
|
|
)
|
|
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, datagen_collection_blocklist_regex_dict="{json.dumps(datagen_collection_blocklist_regex_dict, indent=2)}"'
|
|
)
|
|
|
|
# end task
|
|
#
|
|
task_end = time.time()
|
|
task_run_time = round((task_end - task_start), 3)
|
|
logging.info(
|
|
f'instance_id={self.instance_id}, instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
|
|
)
|
|
|
|
#
|
|
# splk-flx specific collections
|
|
#
|
|
|
|
if self.component == "flx":
|
|
|
|
# set task
|
|
#
|
|
task_start = time.time()
|
|
task_instance_id = get_uuid()
|
|
task_name = "get_thresholds_collection_records"
|
|
|
|
# Thresholds
|
|
thresholds_collection_name = (
|
|
f"kv_trackme_flx_thresholds_tenant_{self.tenant_id}"
|
|
)
|
|
thresholds_collection = self.service.kvstore[thresholds_collection_name]
|
|
(
|
|
thresholds_records,
|
|
thresholds_collection_keys,
|
|
thresholds_collection_dict,
|
|
last_page,
|
|
) = search_kv_collection_sdkmode(
|
|
logging, self.service, thresholds_collection_name, page=1, page_count=0, orderby="keyid"
|
|
)
|
|
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, thresholds_collection_dict="{json.dumps(thresholds_collection_dict, indent=2)}"'
|
|
)
|
|
|
|
# end task
|
|
#
|
|
task_end = time.time()
|
|
task_run_time = round((task_end - task_start), 3)
|
|
logging.info(
|
|
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
|
|
)
|
|
|
|
# set task
|
|
#
|
|
task_start = time.time()
|
|
task_instance_id = get_uuid()
|
|
task_name = "get_drilldown_searches_collection_records"
|
|
|
|
# Drilldown searches
|
|
drilldown_searches_collection_name = (
|
|
f"kv_trackme_flx_drilldown_searches_tenant_{self.tenant_id}"
|
|
)
|
|
drilldown_searches_collection = self.service.kvstore[drilldown_searches_collection_name]
|
|
(
|
|
drilldown_searches_records,
|
|
drilldown_searches_collection_keys,
|
|
drilldown_searches_collection_dict,
|
|
last_page,
|
|
) = search_kv_collection_sdkmode(
|
|
logging, self.service, drilldown_searches_collection_name, page=1, page_count=0, orderby="keyid"
|
|
)
|
|
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, drilldown_searches_collection_dict="{json.dumps(drilldown_searches_collection_dict, indent=2)}"'
|
|
)
|
|
|
|
# end task
|
|
#
|
|
task_end = time.time()
|
|
task_run_time = round((task_end - task_start), 3)
|
|
logging.info(
|
|
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
|
|
)
|
|
|
|
# set task
|
|
#
|
|
task_start = time.time()
|
|
task_instance_id = get_uuid()
|
|
task_name = "get_default_metrics_collection_records"
|
|
|
|
# Default metrics
|
|
default_metrics_collection_name = (
|
|
f"kv_trackme_flx_default_metric_tenant_{self.tenant_id}"
|
|
)
|
|
default_metrics_collection = self.service.kvstore[default_metrics_collection_name]
|
|
(
|
|
default_metrics_records,
|
|
default_metrics_collection_keys,
|
|
default_metrics_collection_dict,
|
|
last_page,
|
|
) = search_kv_collection_sdkmode(
|
|
logging, self.service, default_metrics_collection_name, page=1, page_count=0, orderby="keyid"
|
|
)
|
|
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, default_metrics_collection_dict="{json.dumps(default_metrics_collection_dict, indent=2)}"'
|
|
)
|
|
|
|
# end task
|
|
#
|
|
task_end = time.time()
|
|
task_run_time = round((task_end - task_start), 3)
|
|
logging.info(
|
|
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
|
|
)
|
|
|
|
#
|
|
# splk-fqm specific collections
|
|
#
|
|
|
|
if self.component == "fqm":
|
|
|
|
# set task
|
|
#
|
|
task_start = time.time()
|
|
task_instance_id = get_uuid()
|
|
task_name = "get_thresholds_collection_records"
|
|
|
|
# Thresholds
|
|
thresholds_collection_name = (
|
|
f"kv_trackme_fqm_thresholds_tenant_{self.tenant_id}"
|
|
)
|
|
thresholds_collection = self.service.kvstore[thresholds_collection_name]
|
|
(
|
|
thresholds_records,
|
|
thresholds_collection_keys,
|
|
thresholds_collection_dict,
|
|
last_page,
|
|
) = search_kv_collection_sdkmode(
|
|
logging, self.service, thresholds_collection_name, page=1, page_count=0, orderby="keyid"
|
|
)
|
|
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, thresholds_collection_dict="{json.dumps(thresholds_collection_dict, indent=2)}"'
|
|
)
|
|
|
|
# end task
|
|
#
|
|
task_end = time.time()
|
|
task_run_time = round((task_end - task_start), 3)
|
|
logging.info(
|
|
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
|
|
)
|
|
|
|
#
|
|
# Virtual tenant account settings
|
|
#
|
|
|
|
# outliers tenant level settings
|
|
# outliers tenant level settings (deprecated - kept for backward compatibility)
|
|
# These are no longer used with score-based approach, but kept for backward compatibility
|
|
tenant_outliers_set_state = int(vtenant_conf.get("outliers_set_state", 1))
|
|
tenant_data_sampling_set_state = int(vtenant_conf.get("data_sampling_set_state", 1))
|
|
|
|
#
|
|
# Logical groups collection records
|
|
#
|
|
|
|
# set task
|
|
#
|
|
task_start = time.time()
|
|
task_instance_id = get_uuid()
|
|
task_name = "get_logical_groups_collection_records"
|
|
|
|
logical_group_coll = self.service.kvstore[
|
|
f"kv_trackme_common_logical_group_tenant_{self.tenant_id}"
|
|
]
|
|
|
|
(
|
|
logical_coll_records,
|
|
logical_coll_dict,
|
|
logical_coll_members_list,
|
|
logical_coll_members_dict,
|
|
logical_coll_count,
|
|
) = get_logical_groups_collection_records(logical_group_coll)
|
|
|
|
# log debug
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, function get_logical_groups_collection_records, logical_coll_dict="{json.dumps(logical_coll_dict, indent=2)}", logical_coll_count="{logical_coll_count}"'
|
|
)
|
|
|
|
# end task
|
|
#
|
|
task_end = time.time()
|
|
task_run_time = round((task_end - task_start), 3)
|
|
logging.info(
|
|
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
|
|
)
|
|
|
|
# Process records
|
|
processed_records = []
|
|
records_count = 0
|
|
|
|
# set task
|
|
#
|
|
task_start = time.time()
|
|
task_instance_id = get_uuid()
|
|
task_name = "process_records"
|
|
|
|
for record in records:
|
|
records_count += 1
|
|
try:
|
|
new_record = {}
|
|
|
|
# append_record boolean, True by default unless specific use cases
|
|
append_record = True
|
|
|
|
# get object_value and key
|
|
object_value = record.get("object", None)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, object="{object_value}", record="{json.dumps(record, indent=2)}"'
|
|
)
|
|
|
|
# save the current value of object_state in the record as kvcurrent_object_state, we manipulate real state calculations
|
|
# and we need the original state in some conditions (sla)
|
|
record["kvcurrent_object_state"] = record.get("object_state", "N/A")
|
|
|
|
# The value for key is normally in the field keyid, but in some cases it is in the field key or _key
|
|
# use keyid, key, _key in that order of preference
|
|
if "keyid" in record:
|
|
key_value = record.get("keyid", None)
|
|
elif "object_id" in record:
|
|
key_value = record.get("object_id", None)
|
|
elif "key" in record:
|
|
key_value = record.get("key", None)
|
|
elif "_key" in record:
|
|
key_value = record.get("_key", None)
|
|
else:
|
|
key_value = None
|
|
|
|
# get the score for the object and add to the record
|
|
try:
|
|
score = int(scores_dict.get(key_value, {}).get("score", 0))
|
|
except:
|
|
score = 0
|
|
try:
|
|
score_outliers = int(scores_dict.get(key_value, {}).get("score_outliers", 0))
|
|
except:
|
|
score_outliers = 0
|
|
score_source = scores_dict.get(key_value, {}).get("score_source", [])
|
|
record["score"] = score
|
|
record["score_outliers"] = score_outliers
|
|
record["score_source"] = score_source
|
|
|
|
#
|
|
# Dynamic priority
|
|
#
|
|
|
|
dynamic_priority_lookup(
|
|
key_value,
|
|
priority_collection_keys,
|
|
priority_collection_dict,
|
|
record,
|
|
)
|
|
|
|
#
|
|
# Dynamic tags
|
|
#
|
|
|
|
dynamic_tags_lookup(
|
|
key_value,
|
|
tags_collection_keys,
|
|
tags_collection_dict,
|
|
record,
|
|
)
|
|
|
|
#
|
|
# Dynamic sla_class
|
|
#
|
|
|
|
dynamic_sla_class_lookup(
|
|
key_value,
|
|
sla_collection_keys,
|
|
sla_collection_dict,
|
|
record,
|
|
)
|
|
|
|
#
|
|
# Disruption queue
|
|
#
|
|
|
|
# Aggregate disruption_min_time_sec: take maximum value across all trackers
|
|
aggregated_disruption_min_time_sec = default_disruption_min_time_sec
|
|
if "disruption_min_time_sec" in record:
|
|
try:
|
|
disruption_min_time_value = record.get("disruption_min_time_sec")
|
|
if disruption_min_time_value:
|
|
disruption_times_by_tracker = None
|
|
|
|
# Parse if it's a JSON string
|
|
if isinstance(disruption_min_time_value, str):
|
|
try:
|
|
disruption_times_by_tracker = json.loads(disruption_min_time_value)
|
|
except (json.JSONDecodeError, TypeError):
|
|
# If parsing fails, might be old format numeric value
|
|
try:
|
|
aggregated_disruption_min_time_sec = max(
|
|
default_disruption_min_time_sec,
|
|
int(float(disruption_min_time_value))
|
|
)
|
|
except (ValueError, TypeError):
|
|
pass
|
|
elif isinstance(disruption_min_time_value, dict):
|
|
disruption_times_by_tracker = disruption_min_time_value
|
|
else:
|
|
# Numeric value (old format)
|
|
try:
|
|
aggregated_disruption_min_time_sec = max(
|
|
default_disruption_min_time_sec,
|
|
int(float(disruption_min_time_value))
|
|
)
|
|
except (ValueError, TypeError):
|
|
pass
|
|
|
|
# If tracker-keyed format, take maximum across all trackers
|
|
if disruption_times_by_tracker and isinstance(disruption_times_by_tracker, dict):
|
|
max_disruption_time = max(
|
|
int(float(v)) for v in disruption_times_by_tracker.values()
|
|
)
|
|
aggregated_disruption_min_time_sec = max(
|
|
default_disruption_min_time_sec,
|
|
max_disruption_time
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
|
|
f'failed to aggregate disruption_min_time_sec, exception="{str(e)}"'
|
|
)
|
|
|
|
disruption_queue_record = disruption_queue_lookup(
|
|
key_value,
|
|
disruption_queue_collection_keys,
|
|
disruption_queue_collection_dict,
|
|
aggregated_disruption_min_time_sec,
|
|
)
|
|
if disruption_queue_record:
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, disruption_queue_record="type={type(disruption_queue_record)}, {json.dumps(disruption_queue_record, indent=2)}"'
|
|
)
|
|
|
|
#
|
|
# splk-dsm
|
|
#
|
|
|
|
# get record fields depending on the component
|
|
if self.component == "dsm":
|
|
|
|
# first check blocklist
|
|
if (
|
|
datagen_collection_blocklist_not_regex_dict
|
|
or datagen_collection_blocklist_regex_dict
|
|
):
|
|
append_record = apply_blocklist(
|
|
record,
|
|
datagen_collection_blocklist_not_regex_dict,
|
|
datagen_collection_blocklist_regex_dict,
|
|
)
|
|
|
|
if append_record:
|
|
|
|
# get outliers and data sampling
|
|
try:
|
|
isOutlier = int(record.get("isOutlier", 0))
|
|
except:
|
|
isOutlier = 0
|
|
|
|
try:
|
|
OutliersDisabled = int(record.get("OutliersDisabled", 0))
|
|
except:
|
|
OutliersDisabled = 0
|
|
|
|
try:
|
|
isAnomaly = int(record.get("isAnomaly", 0))
|
|
except:
|
|
isAnomaly = 0
|
|
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", isAnomaly="{isAnomaly}"'
|
|
)
|
|
|
|
# get future_tolerance
|
|
future_tolerance = record.get("future_tolerance", 0)
|
|
try:
|
|
future_tolerance = float(future_tolerance)
|
|
except:
|
|
future_tolerance = 0
|
|
|
|
#
|
|
# DSM Sampling
|
|
#
|
|
|
|
# call function dsm_sampling_lookup
|
|
dsm_sampling_lookup(
|
|
object_value,
|
|
sampling_collection_keys,
|
|
sampling_collection_dict,
|
|
record,
|
|
)
|
|
|
|
# check the value of allow_adaptive_delay (accepted values: true, false - as string)
|
|
allow_adaptive_delay = record.get("allow_adaptive_delay", "true")
|
|
if allow_adaptive_delay not in ["true", "false"]:
|
|
# log a warning
|
|
logging.warning(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", allow_adaptive_delay="{allow_adaptive_delay}" is not a valid value (accepted values: true, false), setting to "true"'
|
|
)
|
|
allow_adaptive_delay = "true"
|
|
# update the record
|
|
record["allow_adaptive_delay"] = "true"
|
|
|
|
# get actual primary KPI values
|
|
data_last_ingestion_lag_seen = record.get(
|
|
"data_last_ingestion_lag_seen", 0
|
|
)
|
|
if data_last_ingestion_lag_seen == "":
|
|
data_last_ingestion_lag_seen = 0
|
|
try:
|
|
data_last_ingestion_lag_seen = float(
|
|
data_last_ingestion_lag_seen
|
|
)
|
|
except:
|
|
data_last_ingestion_lag_seen = 0
|
|
data_last_lag_seen = record.get("data_last_lag_seen", 0)
|
|
|
|
# get per entity thresholds
|
|
data_max_lag_allowed = float(
|
|
record.get("data_max_lag_allowed", 0)
|
|
)
|
|
data_max_delay_allowed = float(
|
|
record.get("data_max_delay_allowed", 0)
|
|
)
|
|
min_dcount_threshold = record.get("min_dcount_threshold", 0)
|
|
try:
|
|
min_dcount_threshold = float(min_dcount_threshold)
|
|
except:
|
|
min_dcount_threshold = 0
|
|
|
|
# get dcount host related information
|
|
min_dcount_host = record.get("min_dcount_host", "any")
|
|
try:
|
|
min_dcount_host = float(min_dcount_host)
|
|
except:
|
|
pass
|
|
min_dcount_field = record.get("min_dcount_field", None)
|
|
|
|
# Get logical group information
|
|
|
|
# get logical group information: object_group_key
|
|
object_group_key = record.get("object_group_key", "")
|
|
|
|
# from logical_coll_dict, get object_logical_group_dict by object_group_key, this is sent to the status function
|
|
object_logical_group_dict = logical_coll_dict.get(
|
|
object_group_key, {}
|
|
)
|
|
|
|
# get data_last_ingest, data_last_time_seen, data_last_time_seen_idx (epochtime)
|
|
data_last_ingest = record.get("data_last_ingest", 0)
|
|
try:
|
|
data_last_ingest = float(data_last_ingest)
|
|
except:
|
|
pass
|
|
data_last_time_seen = record.get("data_last_time_seen", 0)
|
|
if data_last_time_seen == "":
|
|
data_last_time_seen = 0
|
|
try:
|
|
data_last_time_seen = float(data_last_time_seen)
|
|
except:
|
|
data_last_time_seen = 0
|
|
data_last_time_seen_idx = record.get(
|
|
"data_last_time_seen_idx", 0
|
|
)
|
|
try:
|
|
data_last_time_seen_idx = float(data_last_time_seen_idx)
|
|
except:
|
|
pass
|
|
|
|
# get monitoring time policy and rules (new fields)
|
|
monitoring_time_policy = record.get("monitoring_time_policy", None)
|
|
# if unset yet, use the tenant level and add to the record
|
|
if monitoring_time_policy is None or len(monitoring_time_policy) == 0:
|
|
monitoring_time_policy = default_monitoring_time_policy
|
|
record["monitoring_time_policy"] = default_monitoring_time_policy
|
|
monitoring_time_rules = record.get("monitoring_time_rules", None)
|
|
|
|
# call get_monitoring_time_status and define isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg
|
|
(
|
|
isUnderMonitoring,
|
|
monitoring_anomaly_reason,
|
|
isUnderMonitoringMsg,
|
|
) = get_monitoring_time_status(
|
|
monitoring_time_policy,
|
|
monitoring_time_rules,
|
|
)
|
|
|
|
# Get score data for this object_id (key_value) from scores_dict
|
|
score_data = scores_dict.get(key_value, {})
|
|
score = score_data.get("score", 0)
|
|
score_outliers = score_data.get("score_outliers", 0)
|
|
|
|
# call get_outliers_status and define isOutlier (with hybrid scoring)
|
|
isOutlier = get_outliers_status(
|
|
isOutlier, OutliersDisabled, tenant_outliers_set_state, score_outliers=score_outliers
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", OutliersDisabled="{OutliersDisabled}", tenant_outliers_set_state="{tenant_outliers_set_state}", score_outliers="{score_outliers}"'
|
|
)
|
|
|
|
# call get_data_sampling_status and define isAnomaly
|
|
isAnomaly = get_data_sampling_status(
|
|
record.get("data_sample_status_colour"),
|
|
record.get("data_sample_feature"),
|
|
tenant_data_sampling_set_state,
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isAnomaly="{isAnomaly}", tenant_data_sampling_set_state="{tenant_data_sampling_set_state}"'
|
|
)
|
|
|
|
# call get_future_status and define isFuture
|
|
(
|
|
isFuture,
|
|
isFutureMsg,
|
|
merged_future_tolerance,
|
|
) = get_future_status(
|
|
future_tolerance,
|
|
system_future_tolerance,
|
|
data_last_lag_seen,
|
|
data_last_ingestion_lag_seen,
|
|
data_last_time_seen,
|
|
data_last_ingest,
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isFuture="{isFuture}", future_tolerance="{future_tolerance}", system_future_tolerance="{system_future_tolerance}", merged_future_tolerance="{merged_future_tolerance}", data_last_lag_seen="{data_last_lag_seen}", isFutureMsg="{isFutureMsg}"'
|
|
)
|
|
|
|
# call get_is_under_dcount_host and define isUnderDcountHost
|
|
(
|
|
isUnderDcountHost,
|
|
isUnderDcountHostMsg,
|
|
) = get_is_under_dcount_host(
|
|
min_dcount_host, min_dcount_threshold, min_dcount_field
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isUnderDcountHost="{isUnderDcountHost}", isUnderDcountHostMsg="{isUnderDcountHostMsg}", min_dcount_host="{min_dcount_host}", min_dcount_threshold="{min_dcount_threshold}"'
|
|
)
|
|
|
|
# call get_dsm_latency_status and define isUnderLatencyAlert and isUnderLatencyMessage
|
|
(
|
|
isUnderLatencyAlert,
|
|
isUnderLatencyMessage,
|
|
) = get_dsm_latency_status(
|
|
data_last_ingestion_lag_seen,
|
|
data_max_lag_allowed,
|
|
data_last_ingest,
|
|
data_last_time_seen,
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isUnderLatencyAlert="{isUnderLatencyAlert}", isUnderLatencyMessage="{isUnderLatencyMessage}", data_last_ingestion_lag_seen="{data_last_ingestion_lag_seen}", data_max_lag_allowed="{data_max_lag_allowed}", data_last_ingest="{data_last_ingest}", data_last_time_seen="{data_last_time_seen}"'
|
|
)
|
|
|
|
# call get_dsm_delay_status and define isUnderDelayAlert and isUnderDelayMessage
|
|
(
|
|
isUnderDelayAlert,
|
|
isUnderDelayMessage,
|
|
) = get_dsm_delay_status(
|
|
data_last_lag_seen,
|
|
data_max_delay_allowed,
|
|
data_last_ingest,
|
|
data_last_time_seen,
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isUnderDelayAlert="{isUnderDelayAlert}", isUnderDelayMessage="{isUnderDelayMessage}", data_last_lag_seen="{data_last_lag_seen}", data_max_delay_allowed="{data_max_delay_allowed}", data_last_ingest="{data_last_ingest}", data_last_time_seen="{data_last_time_seen}"'
|
|
)
|
|
|
|
# call set_dsm_status and define object_state and anomaly_reason (with hybrid scoring)
|
|
(
|
|
object_state,
|
|
status_message,
|
|
status_message_json,
|
|
anomaly_reason,
|
|
) = set_dsm_status(
|
|
logging,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self._metadata.searchinfo.session_key,
|
|
self.tenant_id,
|
|
record,
|
|
isOutlier,
|
|
isAnomaly,
|
|
isFuture,
|
|
isFutureMsg,
|
|
isUnderMonitoring,
|
|
isUnderMonitoringMsg,
|
|
isUnderDcountHost,
|
|
isUnderDcountHostMsg,
|
|
object_logical_group_dict,
|
|
isUnderLatencyAlert,
|
|
isUnderLatencyMessage,
|
|
isUnderDelayAlert,
|
|
isUnderDelayMessage,
|
|
disruption_queue_collection,
|
|
disruption_queue_record,
|
|
source_handler="trackmedecisionmaker",
|
|
monitoring_anomaly_reason=monitoring_anomaly_reason,
|
|
score=score,
|
|
score_outliers=score_outliers,
|
|
vtenant_account=vtenant_conf,
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, set_dsm_status, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
|
|
)
|
|
|
|
# insert our main fields
|
|
new_record["object_state"] = object_state
|
|
new_record["status_message"] = " | ".join(status_message)
|
|
new_record["status_message_json"] = status_message_json
|
|
new_record["anomaly_reason"] = "|".join(anomaly_reason)
|
|
|
|
# future tolerance
|
|
try:
|
|
new_record["future_tolerance"] = int(
|
|
round(merged_future_tolerance, 0)
|
|
)
|
|
except:
|
|
new_record["future_tolerance"] = -600
|
|
|
|
# convert data_last_time_seen to last_time from epoch
|
|
last_time = convert_epoch_to_datetime(data_last_time_seen)
|
|
new_record["last_time"] = last_time
|
|
|
|
# convert data_last_ingest to last_ingest from epoch
|
|
last_ingest = convert_epoch_to_datetime(data_last_ingest)
|
|
new_record["last_ingest"] = last_ingest
|
|
|
|
# convert data_last_time_seen_idx to last_time_idx from epoch
|
|
last_time_idx = convert_epoch_to_datetime(data_last_time_seen)
|
|
new_record["last_time_idx"] = last_time_idx
|
|
|
|
# get and convert latest_flip_time from epoch
|
|
latest_flip_time_human = record.get("latest_flip_time", 0)
|
|
try:
|
|
latest_flip_time_human = float(latest_flip_time_human)
|
|
except:
|
|
latest_flip_time_human = 0
|
|
new_record["latest_flip_time_human"] = (
|
|
convert_epoch_to_datetime(latest_flip_time_human)
|
|
)
|
|
|
|
# sla_timer
|
|
get_sla_timer(record, sla_classes, sla_default_class)
|
|
|
|
#
|
|
# splk-dhm
|
|
#
|
|
|
|
elif self.component == "dhm":
|
|
|
|
# first check blocklist
|
|
if (
|
|
datagen_collection_blocklist_not_regex_dict
|
|
or datagen_collection_blocklist_regex_dict
|
|
):
|
|
append_record = apply_blocklist(
|
|
record,
|
|
datagen_collection_blocklist_not_regex_dict,
|
|
datagen_collection_blocklist_regex_dict,
|
|
)
|
|
|
|
if append_record:
|
|
|
|
# get splk_dhm_st_summary
|
|
splk_dhm_st_summary = record.get("splk_dhm_st_summary", None)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", splk_dhm_st_summary="{splk_dhm_st_summary}"'
|
|
)
|
|
|
|
# get outliers and data sampling
|
|
try:
|
|
isOutlier = int(record.get("isOutlier", 0))
|
|
except:
|
|
isOutlier = 0
|
|
|
|
try:
|
|
OutliersDisabled = int(record.get("OutliersDisabled", 0))
|
|
except:
|
|
OutliersDisabled = 0
|
|
|
|
try:
|
|
isAnomaly = int(record.get("isAnomaly", 0))
|
|
except:
|
|
isAnomaly = 0
|
|
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", isAnomaly="{isAnomaly}"'
|
|
)
|
|
|
|
# get future_tolerance
|
|
future_tolerance = record.get("future_tolerance", 0)
|
|
try:
|
|
future_tolerance = float(future_tolerance)
|
|
except:
|
|
future_tolerance = 0
|
|
|
|
# check the value of allow_adaptive_delay (accepted values: true, false - as string)
|
|
allow_adaptive_delay = record.get("allow_adaptive_delay", "true")
|
|
if allow_adaptive_delay not in ["true", "false"]:
|
|
# log a warning
|
|
logging.warning(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", allow_adaptive_delay="{allow_adaptive_delay}" is not a valid value (accepted values: true, false), setting to "true"'
|
|
)
|
|
allow_adaptive_delay = "true"
|
|
# update the record
|
|
record["allow_adaptive_delay"] = "true"
|
|
|
|
# get actual primary KPI values
|
|
data_last_ingestion_lag_seen = record.get(
|
|
"data_last_ingestion_lag_seen", 0
|
|
)
|
|
if data_last_ingestion_lag_seen == "":
|
|
data_last_ingestion_lag_seen = 0
|
|
try:
|
|
data_last_ingestion_lag_seen = float(
|
|
data_last_ingestion_lag_seen
|
|
)
|
|
except:
|
|
data_last_ingestion_lag_seen = 0
|
|
data_last_lag_seen = record.get("data_last_lag_seen", 0)
|
|
|
|
# get per entity thresholds
|
|
data_max_lag_allowed = float(
|
|
record.get("data_max_lag_allowed", 0)
|
|
)
|
|
data_max_delay_allowed = float(
|
|
record.get("data_max_delay_allowed", 0)
|
|
)
|
|
|
|
# Get logical group information
|
|
|
|
# get logical group information: object_group_key
|
|
object_group_key = record.get("object_group_key", "")
|
|
|
|
# from logical_coll_dict, get object_logical_group_dict by object_group_key, this is sent to the status function
|
|
object_logical_group_dict = logical_coll_dict.get(
|
|
object_group_key, {}
|
|
)
|
|
|
|
# get data_last_ingest, data_last_time_seen, data_last_time_seen_idx (epochtime)
|
|
data_last_ingest = record.get("data_last_ingest", 0)
|
|
try:
|
|
data_last_ingest = float(data_last_ingest)
|
|
except:
|
|
pass
|
|
data_last_time_seen = record.get("data_last_time_seen", 0)
|
|
if data_last_time_seen == "":
|
|
data_last_time_seen = 0
|
|
try:
|
|
data_last_time_seen = float(data_last_time_seen)
|
|
except:
|
|
data_last_time_seen = 0
|
|
data_last_time_seen_idx = record.get(
|
|
"data_last_time_seen_idx", 0
|
|
)
|
|
try:
|
|
data_last_time_seen_idx = float(data_last_time_seen_idx)
|
|
except:
|
|
pass
|
|
|
|
# get monitoring time policy and rules (new fields)
|
|
monitoring_time_policy = record.get("monitoring_time_policy", None)
|
|
# if unset yet, use the tenant level and add to the record
|
|
if monitoring_time_policy is None or len(monitoring_time_policy) == 0:
|
|
monitoring_time_policy = default_monitoring_time_policy
|
|
record["monitoring_time_policy"] = default_monitoring_time_policy
|
|
monitoring_time_rules = record.get("monitoring_time_rules", None)
|
|
|
|
# call get_monitoring_time_status and define isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg
|
|
(
|
|
isUnderMonitoring,
|
|
monitoring_anomaly_reason,
|
|
isUnderMonitoringMsg,
|
|
) = get_monitoring_time_status(
|
|
monitoring_time_policy,
|
|
monitoring_time_rules,
|
|
)
|
|
|
|
# Get score data for this object_id (key_value) from scores_dict
|
|
score_data = scores_dict.get(key_value, {})
|
|
score = score_data.get("score", 0)
|
|
score_outliers = score_data.get("score_outliers", 0)
|
|
|
|
# call get_outliers_status and define isOutlier (with hybrid scoring)
|
|
isOutlier = get_outliers_status(
|
|
isOutlier, OutliersDisabled, tenant_outliers_set_state, score_outliers=score_outliers
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", OutliersDisabled="{OutliersDisabled}", tenant_outliers_set_state="{tenant_outliers_set_state}", score_outliers="{score_outliers}"'
|
|
)
|
|
|
|
# call get_future_status and define isFuture
|
|
(
|
|
isFuture,
|
|
isFutureMsg,
|
|
merged_future_tolerance,
|
|
) = get_future_status(
|
|
future_tolerance,
|
|
system_future_tolerance,
|
|
data_last_lag_seen,
|
|
data_last_ingestion_lag_seen,
|
|
data_last_time_seen,
|
|
data_last_ingest,
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isFuture="{isFuture}", future_tolerance="{future_tolerance}", system_future_tolerance="{system_future_tolerance}", merged_future_tolerance="{merged_future_tolerance}", data_last_lag_seen="{data_last_lag_seen}", isFutureMsg="{isFutureMsg}"'
|
|
)
|
|
|
|
# call get_dsm_latency_status and define isUnderLatencyAlert and isUnderLatencyMessage
|
|
(
|
|
isUnderLatencyAlert,
|
|
isUnderLatencyMessage,
|
|
) = get_dsm_latency_status(
|
|
data_last_ingestion_lag_seen,
|
|
data_max_lag_allowed,
|
|
data_last_ingest,
|
|
data_last_time_seen,
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isUnderLatencyAlert="{isUnderLatencyAlert}", isUnderLatencyMessage="{isUnderLatencyMessage}", data_last_ingestion_lag_seen="{data_last_ingestion_lag_seen}", data_max_lag_allowed="{data_max_lag_allowed}", data_last_ingest="{data_last_ingest}", data_last_time_seen="{data_last_time_seen}"'
|
|
)
|
|
|
|
# call get_dsm_delay_status and define isUnderDelayAlert and isUnderDelayMessage
|
|
(
|
|
isUnderDelayAlert,
|
|
isUnderDelayMessage,
|
|
) = get_dsm_delay_status(
|
|
data_last_lag_seen,
|
|
data_max_delay_allowed,
|
|
data_last_ingest,
|
|
data_last_time_seen,
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isUnderDelayAlert="{isUnderDelayAlert}", isUnderDelayMessage="{isUnderDelayMessage}", data_last_lag_seen="{data_last_lag_seen}", data_max_delay_allowed="{data_max_delay_allowed}", data_last_ingest="{data_last_ingest}", data_last_time_seen="{data_last_time_seen}"'
|
|
)
|
|
|
|
# call set_dhm_status and define object_state and anomaly_reason (with hybrid scoring)
|
|
(
|
|
object_state,
|
|
status_message,
|
|
status_message_json,
|
|
anomaly_reason,
|
|
splk_dhm_alerting_policy,
|
|
) = set_dhm_status(
|
|
logging,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self._metadata.searchinfo.session_key,
|
|
self.tenant_id,
|
|
record,
|
|
isOutlier,
|
|
isFuture,
|
|
isFutureMsg,
|
|
isUnderMonitoring,
|
|
isUnderMonitoringMsg,
|
|
object_logical_group_dict,
|
|
isUnderLatencyAlert,
|
|
isUnderLatencyMessage,
|
|
isUnderDelayAlert,
|
|
isUnderDelayMessage,
|
|
default_splk_dhm_alerting_policy,
|
|
disruption_queue_collection,
|
|
disruption_queue_record,
|
|
source_handler="trackmedecisionmaker",
|
|
monitoring_anomaly_reason=monitoring_anomaly_reason,
|
|
score=score,
|
|
score_outliers=score_outliers,
|
|
vtenant_account=vtenant_conf,
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
|
|
)
|
|
|
|
# insert our main fields
|
|
new_record["object_state"] = object_state
|
|
new_record["status_message"] = " | ".join(status_message)
|
|
new_record["status_message_json"] = status_message_json
|
|
new_record["anomaly_reason"] = "|".join(anomaly_reason)
|
|
|
|
# future tolerance
|
|
try:
|
|
new_record["future_tolerance"] = int(
|
|
round(merged_future_tolerance, 0)
|
|
)
|
|
except:
|
|
new_record["future_tolerance"] = -600
|
|
|
|
# specific for dhm
|
|
new_record["splk_dhm_alerting_policy"] = (
|
|
splk_dhm_alerting_policy
|
|
)
|
|
|
|
# convert data_last_time_seen to last_time from epoch
|
|
last_time = convert_epoch_to_datetime(data_last_time_seen)
|
|
new_record["last_time"] = last_time
|
|
|
|
# convert data_last_ingest to last_ingest from epoch
|
|
last_ingest = convert_epoch_to_datetime(data_last_ingest)
|
|
new_record["last_ingest"] = last_ingest
|
|
|
|
# convert data_last_time_seen_idx to last_time_idx from epoch
|
|
last_time_idx = convert_epoch_to_datetime(data_last_time_seen)
|
|
new_record["last_time_idx"] = last_time_idx
|
|
|
|
# get and convert latest_flip_time from epoch
|
|
latest_flip_time_human = record.get("latest_flip_time", 0)
|
|
try:
|
|
latest_flip_time_human = float(latest_flip_time_human)
|
|
except:
|
|
latest_flip_time_human = 0
|
|
new_record["latest_flip_time_human"] = (
|
|
convert_epoch_to_datetime(latest_flip_time_human)
|
|
)
|
|
|
|
# sla_timer
|
|
get_sla_timer(record, sla_classes, sla_default_class)
|
|
|
|
#
|
|
# splk-mhm
|
|
#
|
|
|
|
elif self.component == "mhm":
|
|
|
|
# first check blocklist
|
|
if (
|
|
datagen_collection_blocklist_not_regex_dict
|
|
or datagen_collection_blocklist_regex_dict
|
|
):
|
|
append_record = apply_blocklist(
|
|
record,
|
|
datagen_collection_blocklist_not_regex_dict,
|
|
datagen_collection_blocklist_regex_dict,
|
|
)
|
|
|
|
if append_record:
|
|
|
|
# get metric_details
|
|
metric_details = record.get("metric_details", None)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", metric_details="{metric_details}"'
|
|
)
|
|
|
|
# Get logical group information
|
|
|
|
# get logical group information: object_group_key
|
|
object_group_key = record.get("object_group_key", "")
|
|
|
|
# from logical_coll_dict, get object_logical_group_dict by object_group_key, this is sent to the status function
|
|
object_logical_group_dict = logical_coll_dict.get(
|
|
object_group_key, {}
|
|
)
|
|
|
|
# get metric_last_time_seen (epochtime)
|
|
metric_last_time_seen = record.get("metric_last_time_seen", 0)
|
|
try:
|
|
metric_last_time_seen = float(metric_last_time_seen)
|
|
except:
|
|
pass
|
|
|
|
# Get score data for this object_id (key_value) from scores_dict
|
|
score_data = scores_dict.get(key_value, {})
|
|
score = score_data.get("score", 0)
|
|
score_outliers = score_data.get("score_outliers", 0)
|
|
|
|
# call get_future_metrics_status and define isFuture
|
|
isFuture, isFutureMsg = get_future_metrics_status(
|
|
system_future_tolerance,
|
|
metric_last_time_seen,
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isFuture="{isFuture}", system_future_tolerance="{system_future_tolerance}", metric_last_time_seen="{metric_last_time_seen}", isFutureMsg="{isFutureMsg}"'
|
|
)
|
|
|
|
# call set_mhm_status and define object_state and anomaly_reason (with hybrid scoring)
|
|
(
|
|
object_state,
|
|
status_message,
|
|
status_message_json,
|
|
anomaly_reason,
|
|
) = set_mhm_status(
|
|
logging,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self._metadata.searchinfo.session_key,
|
|
self.tenant_id,
|
|
record,
|
|
metric_details,
|
|
isFuture,
|
|
isFutureMsg,
|
|
object_logical_group_dict,
|
|
disruption_queue_collection,
|
|
disruption_queue_record,
|
|
source_handler="trackmedecisionmaker",
|
|
score=score,
|
|
score_outliers=score_outliers,
|
|
vtenant_account=vtenant_conf,
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
|
|
)
|
|
|
|
# insert our main fields
|
|
new_record["object_state"] = object_state
|
|
new_record["status_message"] = " | ".join(status_message)
|
|
new_record["status_message_json"] = status_message_json
|
|
new_record["anomaly_reason"] = "|".join(anomaly_reason)
|
|
|
|
# convert metric_last_time_seen to last_time from epoch
|
|
last_time = convert_epoch_to_datetime(metric_last_time_seen)
|
|
new_record["last_time"] = last_time
|
|
|
|
# get and convert latest_flip_time from epoch
|
|
latest_flip_time_human = record.get("latest_flip_time", 0)
|
|
try:
|
|
latest_flip_time_human = float(latest_flip_time_human)
|
|
except:
|
|
latest_flip_time_human = 0
|
|
new_record["latest_flip_time_human"] = (
|
|
convert_epoch_to_datetime(latest_flip_time_human)
|
|
)
|
|
|
|
# sla_timer
|
|
get_sla_timer(record, sla_classes, sla_default_class)
|
|
|
|
#
|
|
# splk-flx
|
|
#
|
|
|
|
# get record fields depending on the component
|
|
elif self.component == "flx":
|
|
|
|
# first check blocklist
|
|
if (
|
|
datagen_collection_blocklist_not_regex_dict
|
|
or datagen_collection_blocklist_regex_dict
|
|
):
|
|
append_record = apply_blocklist(
|
|
record,
|
|
datagen_collection_blocklist_not_regex_dict,
|
|
datagen_collection_blocklist_regex_dict,
|
|
)
|
|
|
|
if append_record:
|
|
|
|
# get outliers
|
|
try:
|
|
isOutlier = int(record.get("isOutlier", 0))
|
|
except:
|
|
isOutlier = 0
|
|
|
|
try:
|
|
OutliersDisabled = int(record.get("OutliersDisabled", 0))
|
|
except:
|
|
OutliersDisabled = 0
|
|
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}"'
|
|
)
|
|
|
|
# get monitoring time policy and rules (new fields)
|
|
monitoring_time_policy = record.get("monitoring_time_policy", None)
|
|
# if unset yet, use the tenant level and add to the record
|
|
if monitoring_time_policy is None or len(monitoring_time_policy) == 0:
|
|
monitoring_time_policy = default_monitoring_time_policy
|
|
record["monitoring_time_policy"] = default_monitoring_time_policy
|
|
monitoring_time_rules = record.get("monitoring_time_rules", None)
|
|
|
|
# Get logical group information
|
|
|
|
# get logical group information: object_group_key
|
|
object_group_key = record.get("object_group_key", "")
|
|
|
|
# from logical_coll_dict, get object_logical_group_dict by object_group_key, this is sent to the status function
|
|
object_logical_group_dict = logical_coll_dict.get(
|
|
object_group_key, {}
|
|
)
|
|
|
|
# call get_monitoring_time_status and define isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg
|
|
(
|
|
isUnderMonitoring,
|
|
monitoring_anomaly_reason,
|
|
isUnderMonitoringMsg,
|
|
) = get_monitoring_time_status(
|
|
monitoring_time_policy,
|
|
monitoring_time_rules,
|
|
)
|
|
|
|
# Aggregate tracker-keyed JSON fields for concurrent trackers support
|
|
# Aggregate metrics: merge all trackers' metrics into a single dict
|
|
# This MUST be done BEFORE flx_check_dynamic_thresholds which expects aggregated metrics
|
|
if "metrics" in record:
|
|
try:
|
|
metrics_value = record.get("metrics")
|
|
if metrics_value:
|
|
metrics_by_tracker = None
|
|
|
|
# Parse if it's a JSON string
|
|
if isinstance(metrics_value, str):
|
|
try:
|
|
metrics_by_tracker = json.loads(metrics_value)
|
|
except (json.JSONDecodeError, TypeError):
|
|
# If parsing fails, might be old format, skip aggregation
|
|
pass
|
|
elif isinstance(metrics_value, dict):
|
|
metrics_by_tracker = metrics_value
|
|
|
|
if metrics_by_tracker and isinstance(metrics_by_tracker, dict):
|
|
# Check if it's tracker-keyed format (values are dicts) or old format (direct metrics dict)
|
|
aggregated_metrics = {}
|
|
is_tracker_keyed = False
|
|
|
|
for key, value in metrics_by_tracker.items():
|
|
if isinstance(value, dict):
|
|
# Check if value looks like metrics (has numeric/string values) or tracker data
|
|
# If all values in the nested dict are simple types, it's likely metrics
|
|
if all(isinstance(v, (int, float, str, bool)) or v is None for v in value.values()):
|
|
# This is tracker-keyed format, merge all trackers' metrics
|
|
aggregated_metrics.update(value)
|
|
is_tracker_keyed = True
|
|
else:
|
|
# Nested structure, might be tracker data
|
|
is_tracker_keyed = True
|
|
aggregated_metrics.update(value)
|
|
else:
|
|
# Simple value, old format
|
|
break
|
|
|
|
if is_tracker_keyed:
|
|
# Remove internal "status" field from aggregated metrics (not a user metric)
|
|
if "status" in aggregated_metrics:
|
|
del aggregated_metrics["status"]
|
|
|
|
# Update record with aggregated metrics as dict (for backward compatibility)
|
|
# Keep as dict since flx_check_dynamic_thresholds expects a dict
|
|
# Handle empty aggregated_metrics case (e.g., {"tracker1": {}})
|
|
record["metrics"] = aggregated_metrics
|
|
elif not is_tracker_keyed:
|
|
# Old format, keep as-is but ensure it's a dict and remove status field
|
|
if isinstance(metrics_value, str):
|
|
try:
|
|
old_metrics = json.loads(metrics_value)
|
|
if isinstance(old_metrics, dict) and "status" in old_metrics:
|
|
del old_metrics["status"]
|
|
record["metrics"] = old_metrics
|
|
except:
|
|
record["metrics"] = {}
|
|
else:
|
|
if isinstance(metrics_by_tracker, dict) and "status" in metrics_by_tracker:
|
|
metrics_by_tracker = metrics_by_tracker.copy()
|
|
del metrics_by_tracker["status"]
|
|
record["metrics"] = metrics_by_tracker
|
|
except Exception as e:
|
|
logging.error(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
|
|
f'failed to aggregate metrics, exception="{str(e)}"'
|
|
)
|
|
|
|
# flx thresholds lookup
|
|
flx_thresholds_lookup(
|
|
object_value,
|
|
key_value,
|
|
record,
|
|
thresholds_collection_dict,
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, dynamic_thresholds="{json.dumps(record.get("dynamic_thresholds", {}), indent=2)}"'
|
|
)
|
|
|
|
# flx check dynamic thresholds
|
|
threshold_alert, threshold_messages, threshold_scores = (
|
|
flx_check_dynamic_thresholds(
|
|
logging,
|
|
record.get("dynamic_thresholds", {}),
|
|
record.get("metrics", {}),
|
|
)
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, object_value="{object_value}", key_value="{key_value}", threshold_alert="{threshold_alert}", threshold_messages="{threshold_messages}", dynamic_thresholds="{json.dumps(record.get("dynamic_thresholds", {}), indent=2)}", metrics_record="{json.dumps(record.get("metrics", {}), indent=2)}"'
|
|
)
|
|
|
|
# flx drilldown searches lookup
|
|
try:
|
|
flx_drilldown_searches_lookup(
|
|
self.tenant_id,
|
|
record.get("tracker_name", ""),
|
|
record.get("account", "local"),
|
|
record,
|
|
drilldown_searches_collection_dict,
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, drilldown_search="{record.get("drilldown_search", "")}", drilldown_search_earliest="{record.get("drilldown_search_earliest", "")}", drilldown_search_latest="{record.get("drilldown_search_latest", "")}", drilldown_searches="{json.dumps(record.get("drilldown_searches", []), indent=2)}"'
|
|
)
|
|
except Exception as e:
|
|
logging.error(f"instance_id={self.instance_id}, Error in flx_drilldown_searches_lookup: {str(e)}")
|
|
|
|
# flx default metrics lookup
|
|
try:
|
|
flx_default_metrics_lookup(
|
|
self.tenant_id,
|
|
record.get("tracker_name", ""),
|
|
record,
|
|
default_metrics_collection_dict,
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, default_metric="{record.get("default_metric", "")}"'
|
|
)
|
|
except Exception as e:
|
|
logging.error(f"instance_id={self.instance_id}, Error in flx_default_metrics_lookup: {str(e)}")
|
|
|
|
# Get score data for this object_id (key_value) from scores_dict
|
|
score_data = scores_dict.get(key_value, {})
|
|
score = score_data.get("score", 0)
|
|
score_outliers = score_data.get("score_outliers", 0)
|
|
|
|
# call get_outliers_status and define isOutlier (with hybrid scoring)
|
|
isOutlier = get_outliers_status(
|
|
isOutlier, OutliersDisabled, tenant_outliers_set_state, score_outliers=score_outliers
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", OutliersDisabled="{OutliersDisabled}", tenant_outliers_set_state="{tenant_outliers_set_state}", score_outliers="{score_outliers}"'
|
|
)
|
|
|
|
# Preserve tracker-keyed JSON for status, status_description and status_description_short
|
|
# We'll aggregate them temporarily for set_flx_status, then restore for proper merging in trackmepersistentfields
|
|
# IMPORTANT: The macro preserves status as status_preserved, but we need to check if it's tracker-keyed format
|
|
# If status_preserved exists and is tracker-keyed JSON, use it; otherwise check status field
|
|
status_tracker_keyed = None
|
|
status_desc_tracker_keyed = None
|
|
status_desc_short_tracker_keyed = None
|
|
|
|
# Check if macro preserved tracker-keyed format (status_preserved field)
|
|
# The macro preserves status before mvindex operation
|
|
if "status_preserved" in record:
|
|
status_preserved = record.get("status_preserved")
|
|
if isinstance(status_preserved, str):
|
|
try:
|
|
parsed = json.loads(status_preserved)
|
|
if isinstance(parsed, dict):
|
|
# It's tracker-keyed format from macro preservation
|
|
status_tracker_keyed = status_preserved
|
|
except (json.JSONDecodeError, TypeError):
|
|
pass
|
|
elif isinstance(status_preserved, dict):
|
|
status_tracker_keyed = json.dumps(status_preserved)
|
|
|
|
# If not found in preserved field, check status field directly
|
|
if not status_tracker_keyed and "status" in record:
|
|
status_raw = record.get("status")
|
|
# Check if it's already tracker-keyed format (JSON string or dict)
|
|
if isinstance(status_raw, str):
|
|
# Try to parse as JSON to verify it's tracker-keyed format
|
|
try:
|
|
parsed_status = json.loads(status_raw)
|
|
if isinstance(parsed_status, dict):
|
|
# It's tracker-keyed format, preserve it
|
|
status_tracker_keyed = status_raw
|
|
except (json.JSONDecodeError, TypeError):
|
|
# Not valid JSON, might be old format
|
|
pass
|
|
elif isinstance(status_raw, dict):
|
|
# Already a dict (tracker-keyed format)
|
|
status_tracker_keyed = json.dumps(status_raw)
|
|
|
|
# Check if macro preserved tracker-keyed format (status_description_preserved field)
|
|
if "status_description_preserved" in record:
|
|
status_desc_preserved = record.get("status_description_preserved")
|
|
if isinstance(status_desc_preserved, str):
|
|
try:
|
|
parsed = json.loads(status_desc_preserved)
|
|
if isinstance(parsed, dict):
|
|
# It's tracker-keyed format from macro preservation
|
|
status_desc_tracker_keyed = status_desc_preserved
|
|
except (json.JSONDecodeError, TypeError):
|
|
# Check if it contains " | " separator (already aggregated)
|
|
if " | " not in status_desc_preserved:
|
|
status_desc_tracker_keyed = status_desc_preserved
|
|
elif isinstance(status_desc_preserved, dict):
|
|
status_desc_tracker_keyed = json.dumps(status_desc_preserved)
|
|
|
|
# If not found in preserved field, check status_description field directly
|
|
if not status_desc_tracker_keyed and "status_description" in record:
|
|
status_desc_raw = record.get("status_description")
|
|
# Check if it's tracker-keyed format
|
|
if isinstance(status_desc_raw, str):
|
|
# Try to parse as JSON to verify it's tracker-keyed format
|
|
try:
|
|
parsed_desc = json.loads(status_desc_raw)
|
|
if isinstance(parsed_desc, dict):
|
|
# It's tracker-keyed format, preserve it
|
|
status_desc_tracker_keyed = status_desc_raw
|
|
except (json.JSONDecodeError, TypeError):
|
|
# Check if it contains " | " separator (already aggregated)
|
|
if " | " not in status_desc_raw:
|
|
# Might be old format single string
|
|
status_desc_tracker_keyed = status_desc_raw
|
|
elif isinstance(status_desc_raw, dict):
|
|
# Already a dict (tracker-keyed format)
|
|
status_desc_tracker_keyed = json.dumps(status_desc_raw)
|
|
|
|
# Check if macro preserved tracker-keyed format (status_description_short_preserved field)
|
|
if "status_description_short_preserved" in record:
|
|
status_desc_short_preserved = record.get("status_description_short_preserved")
|
|
if isinstance(status_desc_short_preserved, str):
|
|
try:
|
|
parsed = json.loads(status_desc_short_preserved)
|
|
if isinstance(parsed, dict):
|
|
# It's tracker-keyed format from macro preservation
|
|
status_desc_short_tracker_keyed = status_desc_short_preserved
|
|
except (json.JSONDecodeError, TypeError):
|
|
if " | " not in status_desc_short_preserved:
|
|
status_desc_short_tracker_keyed = status_desc_short_preserved
|
|
elif isinstance(status_desc_short_preserved, dict):
|
|
status_desc_short_tracker_keyed = json.dumps(status_desc_short_preserved)
|
|
|
|
# If not found in preserved field, check status_description_short field directly
|
|
if not status_desc_short_tracker_keyed and "status_description_short" in record:
|
|
status_desc_short_raw = record.get("status_description_short")
|
|
# Similar logic as status_description
|
|
if isinstance(status_desc_short_raw, str):
|
|
try:
|
|
parsed_desc_short = json.loads(status_desc_short_raw)
|
|
if isinstance(parsed_desc_short, dict):
|
|
status_desc_short_tracker_keyed = status_desc_short_raw
|
|
except (json.JSONDecodeError, TypeError):
|
|
if " | " not in status_desc_short_raw:
|
|
status_desc_short_tracker_keyed = status_desc_short_raw
|
|
elif isinstance(status_desc_short_raw, dict):
|
|
status_desc_short_tracker_keyed = json.dumps(status_desc_short_raw)
|
|
|
|
# Aggregate status temporarily for set_flx_status: worst-status logic (2 > 3 > 1)
|
|
if "status" in record:
|
|
try:
|
|
status_str = record.get("status")
|
|
if status_str:
|
|
aggregated_status = None
|
|
|
|
if isinstance(status_str, str):
|
|
try:
|
|
status_by_tracker = json.loads(status_str)
|
|
if isinstance(status_by_tracker, dict):
|
|
# Tracker-keyed format: apply worst-status logic
|
|
status_values = list(status_by_tracker.values())
|
|
if status_values:
|
|
# Worst-status logic: 2 (red) > 3 (orange) > 1 (green)
|
|
if 2 in status_values:
|
|
aggregated_status = 2 # Red
|
|
elif 3 in status_values:
|
|
aggregated_status = 3 # Orange
|
|
else:
|
|
aggregated_status = 1 # Green (all are 1)
|
|
except (json.JSONDecodeError, TypeError):
|
|
# If parsing fails, might be old format integer string
|
|
try:
|
|
aggregated_status = int(status_str)
|
|
except (ValueError, TypeError):
|
|
pass
|
|
elif isinstance(status_str, dict):
|
|
# Already a dict, apply worst-status logic
|
|
status_values = list(status_str.values())
|
|
if status_values:
|
|
if 2 in status_values:
|
|
aggregated_status = 2 # Red
|
|
elif 3 in status_values:
|
|
aggregated_status = 3 # Orange
|
|
else:
|
|
aggregated_status = 1 # Green
|
|
elif isinstance(status_str, int):
|
|
# Old format integer, use as-is
|
|
aggregated_status = status_str
|
|
|
|
# Temporarily update record with aggregated status for set_flx_status
|
|
if aggregated_status is not None:
|
|
record["status"] = aggregated_status
|
|
except Exception as e:
|
|
logging.error(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
|
|
f'failed to aggregate status, exception="{str(e)}"'
|
|
)
|
|
|
|
# Determine number of trackers to decide if we need prefix
|
|
num_trackers = 1
|
|
if "tracker_name" in record:
|
|
try:
|
|
tracker_name_value = record.get("tracker_name")
|
|
if tracker_name_value:
|
|
if isinstance(tracker_name_value, str):
|
|
try:
|
|
tracker_names = json.loads(tracker_name_value)
|
|
if isinstance(tracker_names, list):
|
|
num_trackers = len(tracker_names)
|
|
except (json.JSONDecodeError, TypeError):
|
|
# If parsing fails, might be comma-separated string
|
|
if "," in tracker_name_value:
|
|
num_trackers = len([t.strip() for t in tracker_name_value.split(",")])
|
|
elif isinstance(tracker_name_value, list):
|
|
num_trackers = len(tracker_name_value)
|
|
except Exception:
|
|
pass
|
|
|
|
# Aggregate status_description temporarily for set_flx_status: concatenate all trackers' descriptions
|
|
if "status_description" in record:
|
|
try:
|
|
status_desc_str = record.get("status_description")
|
|
if status_desc_str:
|
|
if isinstance(status_desc_str, str):
|
|
try:
|
|
status_desc_by_tracker = json.loads(status_desc_str)
|
|
if isinstance(status_desc_by_tracker, dict):
|
|
# Check if it's tracker-keyed format
|
|
status_descriptions = []
|
|
for tracker_name, desc in status_desc_by_tracker.items():
|
|
if desc:
|
|
# Only add prefix if multiple trackers
|
|
if num_trackers > 1:
|
|
status_descriptions.append(f"{tracker_name}: {desc}")
|
|
else:
|
|
status_descriptions.append(desc)
|
|
|
|
if status_descriptions:
|
|
# Temporarily update record with aggregated status_description for set_flx_status
|
|
record["status_description"] = " | ".join(status_descriptions)
|
|
else:
|
|
# Empty, keep as-is
|
|
pass
|
|
except (json.JSONDecodeError, TypeError):
|
|
# If parsing fails, might be old format string, keep as-is
|
|
pass
|
|
elif isinstance(status_desc_str, dict):
|
|
# Already a dict, aggregate
|
|
status_descriptions = []
|
|
for tracker_name, desc in status_desc_str.items():
|
|
if desc:
|
|
# Only add prefix if multiple trackers
|
|
if num_trackers > 1:
|
|
status_descriptions.append(f"{tracker_name}: {desc}")
|
|
else:
|
|
status_descriptions.append(desc)
|
|
|
|
if status_descriptions:
|
|
# Temporarily update record with aggregated status_description for set_flx_status
|
|
record["status_description"] = " | ".join(status_descriptions)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
|
|
f'failed to aggregate status_description, exception="{str(e)}"'
|
|
)
|
|
|
|
# Aggregate status_description_short temporarily for set_flx_status: concatenate all trackers' descriptions
|
|
if "status_description_short" in record:
|
|
try:
|
|
status_desc_short_str = record.get("status_description_short")
|
|
if status_desc_short_str:
|
|
if isinstance(status_desc_short_str, str):
|
|
try:
|
|
status_desc_short_by_tracker = json.loads(status_desc_short_str)
|
|
if isinstance(status_desc_short_by_tracker, dict):
|
|
# Check if it's tracker-keyed format
|
|
status_descriptions_short = []
|
|
for tracker_name, desc in status_desc_short_by_tracker.items():
|
|
if desc:
|
|
# Only add prefix if multiple trackers
|
|
if num_trackers > 1:
|
|
status_descriptions_short.append(f"{tracker_name}: {desc}")
|
|
else:
|
|
status_descriptions_short.append(desc)
|
|
|
|
if status_descriptions_short:
|
|
# Temporarily update record with aggregated status_description_short for set_flx_status
|
|
record["status_description_short"] = " | ".join(status_descriptions_short)
|
|
except (json.JSONDecodeError, TypeError):
|
|
# If parsing fails, might be old format string, keep as-is
|
|
pass
|
|
elif isinstance(status_desc_short_str, dict):
|
|
# Already a dict, aggregate
|
|
status_descriptions_short = []
|
|
for tracker_name, desc in status_desc_short_str.items():
|
|
if desc:
|
|
# Only add prefix if multiple trackers
|
|
if num_trackers > 1:
|
|
status_descriptions_short.append(f"{tracker_name}: {desc}")
|
|
else:
|
|
status_descriptions_short.append(desc)
|
|
|
|
if status_descriptions_short:
|
|
# Temporarily update record with aggregated status_description_short for set_flx_status
|
|
record["status_description_short"] = " | ".join(status_descriptions_short)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
|
|
f'failed to aggregate status_description_short, exception="{str(e)}"'
|
|
)
|
|
|
|
# Generate per-tracker status messages before calling set_flx_status
|
|
# This allows us to store individual messages per tracker in status_message_json
|
|
per_tracker_status_messages = []
|
|
# Only generate per-tracker messages if we have valid tracker-keyed data
|
|
# Both status and status_description must be tracker-keyed format (JSON strings that parse to dicts)
|
|
if status_tracker_keyed and status_desc_tracker_keyed:
|
|
try:
|
|
# Parse tracker-keyed status and status_description
|
|
status_by_tracker = None
|
|
status_desc_by_tracker = None
|
|
|
|
if isinstance(status_tracker_keyed, str):
|
|
try:
|
|
status_by_tracker = json.loads(status_tracker_keyed)
|
|
except (json.JSONDecodeError, TypeError):
|
|
pass
|
|
elif isinstance(status_tracker_keyed, dict):
|
|
status_by_tracker = status_tracker_keyed
|
|
|
|
if isinstance(status_desc_tracker_keyed, str):
|
|
try:
|
|
status_desc_by_tracker = json.loads(status_desc_tracker_keyed)
|
|
except (json.JSONDecodeError, TypeError):
|
|
pass
|
|
elif isinstance(status_desc_tracker_keyed, dict):
|
|
status_desc_by_tracker = status_desc_tracker_keyed
|
|
|
|
# Generate status message for each tracker
|
|
if isinstance(status_by_tracker, dict) and isinstance(status_desc_by_tracker, dict):
|
|
# Verify we have tracker-keyed data (dict with multiple keys)
|
|
if len(status_by_tracker) > 0 and len(status_desc_by_tracker) > 0:
|
|
# Sort tracker names for consistent ordering
|
|
sorted_tracker_names = sorted(status_by_tracker.keys())
|
|
for tracker_name in sorted_tracker_names:
|
|
tracker_status = status_by_tracker.get(tracker_name)
|
|
tracker_status_desc = status_desc_by_tracker.get(tracker_name, "unknown")
|
|
|
|
if tracker_status is None:
|
|
continue
|
|
|
|
# Skip if status_description contains " | " (already aggregated)
|
|
if isinstance(tracker_status_desc, str) and " | " in tracker_status_desc:
|
|
logging.warning(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
|
|
f'tracker="{tracker_name}" has aggregated status_description, skipping per-tracker message generation'
|
|
)
|
|
continue
|
|
|
|
try:
|
|
tracker_status_int = int(tracker_status)
|
|
except (ValueError, TypeError):
|
|
tracker_status_int = 1
|
|
|
|
# Generate status message for this tracker (same format as set_flx_status)
|
|
# Use only this tracker's description, not the aggregated one
|
|
# Only add prefix if multiple trackers
|
|
if num_trackers > 1:
|
|
status_desc_with_prefix = f"{tracker_name}: {tracker_status_desc}"
|
|
else:
|
|
status_desc_with_prefix = tracker_status_desc
|
|
|
|
if tracker_status_int == 1:
|
|
tracker_msg = f"The entity status is complying with monitoring rules (status: {tracker_status_int}, status_description: {status_desc_with_prefix})"
|
|
else:
|
|
tracker_msg = f"The entity status is not complying with monitoring rules (status: {tracker_status_int}, status_description: {status_desc_with_prefix})"
|
|
|
|
per_tracker_status_messages.append(tracker_msg)
|
|
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
|
|
f'generated {len(per_tracker_status_messages)} per-tracker status messages from {len(sorted_tracker_names)} trackers'
|
|
)
|
|
else:
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
|
|
f'tracker-keyed data is empty, cannot generate per-tracker messages'
|
|
)
|
|
else:
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
|
|
f'tracker-keyed data is not in expected format: status_by_tracker={type(status_by_tracker)}, status_desc_by_tracker={type(status_desc_by_tracker)}'
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
|
|
f'failed to generate per-tracker status messages, exception="{str(e)}"'
|
|
)
|
|
|
|
# call set_flx_status and define object_state and anomaly_reason (with hybrid scoring)
|
|
(
|
|
object_state,
|
|
status_message,
|
|
status_message_json,
|
|
anomaly_reason,
|
|
) = set_flx_status(
|
|
logging,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self._metadata.searchinfo.session_key,
|
|
self.tenant_id,
|
|
record,
|
|
isOutlier,
|
|
isUnderMonitoring,
|
|
isUnderMonitoringMsg,
|
|
object_logical_group_dict,
|
|
threshold_alert,
|
|
threshold_messages,
|
|
disruption_queue_collection,
|
|
disruption_queue_record,
|
|
source_handler="trackmedecisionmaker",
|
|
score=score,
|
|
score_outliers=score_outliers,
|
|
threshold_scores=threshold_scores,
|
|
vtenant_account=vtenant_conf,
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
|
|
)
|
|
|
|
# Replace status_message_json["status_message"] with per-tracker messages if available
|
|
# Otherwise keep the aggregated message from set_flx_status
|
|
if per_tracker_status_messages:
|
|
# Use per-tracker messages for better visibility
|
|
# Each tracker gets its own message in the array
|
|
status_message_json["status_message"] = per_tracker_status_messages
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
|
|
f'replaced status_message_json with {len(per_tracker_status_messages)} per-tracker messages'
|
|
)
|
|
else:
|
|
# If no per-tracker messages were generated (e.g., old format), keep the aggregated message
|
|
# This ensures backward compatibility
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", '
|
|
f'using aggregated status_message from set_flx_status (no per-tracker messages generated)'
|
|
)
|
|
|
|
# Restore tracker-keyed JSON for status, status_description and status_description_short
|
|
# This ensures proper merging in trackmepersistentfields
|
|
if status_tracker_keyed is not None:
|
|
record["status"] = status_tracker_keyed
|
|
if status_desc_tracker_keyed is not None:
|
|
record["status_description"] = status_desc_tracker_keyed
|
|
if status_desc_short_tracker_keyed is not None:
|
|
record["status_description_short"] = status_desc_short_tracker_keyed
|
|
|
|
# insert our main fields
|
|
new_record["object_state"] = object_state
|
|
new_record["status_message"] = " | ".join(status_message)
|
|
new_record["status_message_json"] = status_message_json
|
|
new_record["anomaly_reason"] = "|".join(anomaly_reason)
|
|
|
|
# get and convert latest_flip_time from epoch
|
|
latest_flip_time_human = record.get("latest_flip_time", 0)
|
|
try:
|
|
latest_flip_time_human = float(latest_flip_time_human)
|
|
except:
|
|
latest_flip_time_human = 0
|
|
new_record["latest_flip_time_human"] = (
|
|
convert_epoch_to_datetime(latest_flip_time_human)
|
|
)
|
|
|
|
# sla_timer
|
|
get_sla_timer(record, sla_classes, sla_default_class)
|
|
|
|
# specific to flx, generate the status metric
|
|
try:
|
|
trackme_flx_gen_metrics(
|
|
record.get("_time", time.time()),
|
|
self.tenant_id,
|
|
object_value,
|
|
key_value,
|
|
metric_index,
|
|
json.dumps({"status": int(record.get("status", 1))}),
|
|
)
|
|
except Exception as e:
|
|
error_msg = f'instance_id={self.instance_id}, Failed to call trackme_flx_gen_metrics with exception="{str(e)}"'
|
|
logging.error(error_msg)
|
|
|
|
#
|
|
# splk-fqm
|
|
#
|
|
|
|
# get record fields depending on the component
|
|
elif self.component == "fqm":
|
|
|
|
# first check blocklist
|
|
if (
|
|
datagen_collection_blocklist_not_regex_dict
|
|
or datagen_collection_blocklist_regex_dict
|
|
):
|
|
append_record = apply_blocklist(
|
|
record,
|
|
datagen_collection_blocklist_not_regex_dict,
|
|
datagen_collection_blocklist_regex_dict,
|
|
)
|
|
|
|
if append_record:
|
|
|
|
# get outliers
|
|
try:
|
|
isOutlier = int(record.get("isOutlier", 0))
|
|
except:
|
|
isOutlier = 0
|
|
|
|
try:
|
|
OutliersDisabled = int(record.get("OutliersDisabled", 0))
|
|
except:
|
|
OutliersDisabled = 0
|
|
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}"'
|
|
)
|
|
|
|
# get monitoring time policy and rules (new fields)
|
|
monitoring_time_policy = record.get("monitoring_time_policy", None)
|
|
# if unset yet, use the tenant level and add to the record
|
|
if monitoring_time_policy is None or len(monitoring_time_policy) == 0:
|
|
monitoring_time_policy = default_monitoring_time_policy
|
|
record["monitoring_time_policy"] = default_monitoring_time_policy
|
|
monitoring_time_rules = record.get("monitoring_time_rules", None)
|
|
|
|
# Get logical group information
|
|
|
|
# get logical group information: object_group_key
|
|
object_group_key = record.get("object_group_key", "")
|
|
|
|
# from logical_coll_dict, get object_logical_group_dict by object_group_key, this is sent to the status function
|
|
object_logical_group_dict = logical_coll_dict.get(
|
|
object_group_key, {}
|
|
)
|
|
|
|
# call get_monitoring_time_status and define isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg
|
|
(
|
|
isUnderMonitoring,
|
|
monitoring_anomaly_reason,
|
|
isUnderMonitoringMsg,
|
|
) = get_monitoring_time_status(
|
|
monitoring_time_policy,
|
|
monitoring_time_rules,
|
|
)
|
|
|
|
# fqm thresholds lookup
|
|
fqm_thresholds_lookup(
|
|
object_value,
|
|
key_value,
|
|
record,
|
|
thresholds_collection_dict,
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, dynamic_thresholds="{json.dumps(record.get("dynamic_thresholds", {}), indent=2)}"'
|
|
)
|
|
|
|
# fqm check dynamic thresholds
|
|
threshold_alert, threshold_messages, threshold_scores = (
|
|
fqm_check_dynamic_thresholds(
|
|
logging,
|
|
record.get("dynamic_thresholds", {}),
|
|
record.get("metrics", {}),
|
|
)
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, object_value="{object_value}", key_value="{key_value}", threshold_alert="{threshold_alert}", threshold_messages="{threshold_messages}", dynamic_thresholds="{json.dumps(record.get("dynamic_thresholds", {}), indent=2)}", metrics_record="{json.dumps(record.get("metrics", {}), indent=2)}"'
|
|
)
|
|
|
|
# Get score data for this object_id (key_value) from scores_dict
|
|
score_data = scores_dict.get(key_value, {})
|
|
score = score_data.get("score", 0)
|
|
score_outliers = score_data.get("score_outliers", 0)
|
|
|
|
# call get_outliers_status and define isOutlier (with hybrid scoring)
|
|
isOutlier = get_outliers_status(
|
|
isOutlier, OutliersDisabled, tenant_outliers_set_state, score_outliers=score_outliers
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", OutliersDisabled="{OutliersDisabled}", tenant_outliers_set_state="{tenant_outliers_set_state}", score_outliers="{score_outliers}"'
|
|
)
|
|
|
|
# call set_fqm_status and define object_state and anomaly_reason (with hybrid scoring)
|
|
(
|
|
object_state,
|
|
status_message,
|
|
status_message_json,
|
|
anomaly_reason,
|
|
) = set_fqm_status(
|
|
logging,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self._metadata.searchinfo.session_key,
|
|
self.tenant_id,
|
|
record,
|
|
isOutlier,
|
|
isUnderMonitoring,
|
|
isUnderMonitoringMsg,
|
|
object_logical_group_dict,
|
|
threshold_alert,
|
|
threshold_messages,
|
|
disruption_queue_collection,
|
|
disruption_queue_record,
|
|
source_handler="trackmedecisionmaker",
|
|
score=score,
|
|
score_outliers=score_outliers,
|
|
threshold_scores=threshold_scores,
|
|
vtenant_account=vtenant_conf,
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
|
|
)
|
|
|
|
# insert our main fields
|
|
new_record["object_state"] = object_state
|
|
new_record["status_message"] = " | ".join(status_message)
|
|
new_record["status_message_json"] = status_message_json
|
|
new_record["anomaly_reason"] = "|".join(anomaly_reason)
|
|
|
|
# get and convert latest_flip_time from epoch
|
|
latest_flip_time_human = record.get("latest_flip_time", 0)
|
|
try:
|
|
latest_flip_time_human = float(latest_flip_time_human)
|
|
except:
|
|
latest_flip_time_human = 0
|
|
new_record["latest_flip_time_human"] = (
|
|
convert_epoch_to_datetime(latest_flip_time_human)
|
|
)
|
|
|
|
# sla_timer
|
|
get_sla_timer(record, sla_classes, sla_default_class)
|
|
|
|
# specific to fqm, generate the status metric
|
|
try:
|
|
trackme_fqm_gen_metrics(
|
|
record.get("_time", time.time()),
|
|
self.tenant_id,
|
|
object_value,
|
|
key_value,
|
|
metric_index,
|
|
json.dumps({"status": int(record.get("status", 1))}),
|
|
)
|
|
except Exception as e:
|
|
error_msg = f'instance_id={self.instance_id}, Failed to call trackme_fqm_gen_metrics with exception="{str(e)}"'
|
|
logging.error(error_msg)
|
|
|
|
#
|
|
# splk-wlk
|
|
#
|
|
|
|
# get record fields depending on the component
|
|
elif self.component == "wlk":
|
|
|
|
# first check blocklist
|
|
if (
|
|
datagen_collection_blocklist_not_regex_dict
|
|
or datagen_collection_blocklist_regex_dict
|
|
):
|
|
append_record = apply_blocklist(
|
|
record,
|
|
datagen_collection_blocklist_not_regex_dict,
|
|
datagen_collection_blocklist_regex_dict,
|
|
)
|
|
|
|
if append_record:
|
|
|
|
# get outliers
|
|
try:
|
|
isOutlier = int(record.get("isOutlier", 0))
|
|
except:
|
|
isOutlier = 0
|
|
|
|
try:
|
|
OutliersDisabled = int(record.get("OutliersDisabled", 0))
|
|
except:
|
|
OutliersDisabled = 0
|
|
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}"'
|
|
)
|
|
|
|
# get monitoring time policy and rules (new fields)
|
|
monitoring_time_policy = record.get("monitoring_time_policy", None)
|
|
# if unset yet, use the tenant level and add to the record
|
|
if monitoring_time_policy is None or len(monitoring_time_policy) == 0:
|
|
monitoring_time_policy = default_monitoring_time_policy
|
|
record["monitoring_time_policy"] = default_monitoring_time_policy
|
|
monitoring_time_rules = record.get("monitoring_time_rules", None)
|
|
|
|
# call get_monitoring_time_status and define isUnderMonitoring, monitoring_anomaly_reason, isUnderMonitoringMsg
|
|
(
|
|
isUnderMonitoring,
|
|
monitoring_anomaly_reason,
|
|
isUnderMonitoringMsg,
|
|
) = get_monitoring_time_status(
|
|
monitoring_time_policy,
|
|
monitoring_time_rules,
|
|
)
|
|
|
|
# Get score data for this object_id (key_value) from scores_dict
|
|
score_data = scores_dict.get(key_value, {})
|
|
score = score_data.get("score", 0)
|
|
score_outliers = score_data.get("score_outliers", 0)
|
|
|
|
# call get_outliers_status and define isOutlier (with hybrid scoring)
|
|
isOutlier = get_outliers_status(
|
|
isOutlier, OutliersDisabled, tenant_outliers_set_state, score_outliers=score_outliers
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", isOutlier="{isOutlier}", OutliersDisabled="{OutliersDisabled}", tenant_outliers_set_state="{tenant_outliers_set_state}", score_outliers="{score_outliers}"'
|
|
)
|
|
|
|
# call set_wlk_status and define object_state and anomaly_reason (with hybrid scoring)
|
|
(
|
|
object_state,
|
|
status_message,
|
|
status_message_json,
|
|
anomaly_reason,
|
|
) = set_wlk_status(
|
|
logging,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self._metadata.searchinfo.session_key,
|
|
self.tenant_id,
|
|
record,
|
|
isOutlier,
|
|
isUnderMonitoring,
|
|
isUnderMonitoringMsg,
|
|
disruption_queue_collection,
|
|
disruption_queue_record,
|
|
source_handler="trackmedecisionmaker",
|
|
monitoring_anomaly_reason=monitoring_anomaly_reason,
|
|
score=score,
|
|
score_outliers=score_outliers,
|
|
vtenant_account=vtenant_conf,
|
|
)
|
|
logging.debug(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", object_value="{object_value}", key_value="{key_value}", object_state="{object_state}", status_message="{status_message}", anomaly_reason="{anomaly_reason}"'
|
|
)
|
|
|
|
# insert our main fields
|
|
new_record["object_state"] = object_state
|
|
new_record["status_message"] = " | ".join(status_message)
|
|
new_record["status_message_json"] = status_message_json
|
|
new_record["anomaly_reason"] = "|".join(anomaly_reason)
|
|
|
|
# get and convert latest_flip_time from epoch
|
|
latest_flip_time_human = record.get("latest_flip_time", 0)
|
|
try:
|
|
latest_flip_time_human = float(latest_flip_time_human)
|
|
except:
|
|
latest_flip_time_human = 0
|
|
new_record["latest_flip_time_human"] = (
|
|
convert_epoch_to_datetime(latest_flip_time_human)
|
|
)
|
|
|
|
# sla_timer
|
|
get_sla_timer(record, sla_classes, sla_default_class)
|
|
|
|
#
|
|
# End per component processing
|
|
#
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'instance_id={self.instance_id}, tenant_id="{self.tenant_id}", component="{self.component}", Error processing record, record="{json.dumps(record, indent=2)}", exception="{str(e)}"'
|
|
)
|
|
continue # Proceed with next record
|
|
|
|
#
|
|
# End per component processing
|
|
#
|
|
|
|
if append_record:
|
|
|
|
# add all key value pairs from the original record to new_record if not present already
|
|
for key, value in record.items():
|
|
if key not in new_record:
|
|
new_record[key] = value
|
|
|
|
# add new_record to processed_records
|
|
processed_records.append(new_record)
|
|
|
|
# end task
|
|
#
|
|
task_end = time.time()
|
|
task_run_time = round((task_end - task_start), 3)
|
|
logging.info(
|
|
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
|
|
)
|
|
|
|
#
|
|
# Render
|
|
#
|
|
|
|
# set task
|
|
#
|
|
task_start = time.time()
|
|
task_instance_id = get_uuid()
|
|
task_name = "render_records"
|
|
|
|
for yield_record in self.generate_fields(processed_records):
|
|
# logging
|
|
logging.debug(f'instance_id={self.instance_id}, yield_record="{json.dumps(yield_record, indent=2)}"')
|
|
|
|
# yield record
|
|
yield yield_record
|
|
|
|
# end task
|
|
#
|
|
task_end = time.time()
|
|
task_run_time = round((task_end - task_start), 3)
|
|
logging.info(
|
|
f'instance_id={self.instance_id}, task="{task_name}", task_instance_id={task_instance_id}, task_run_time="{task_run_time}", task_end=1, task has terminated.'
|
|
)
|
|
|
|
# performance counter
|
|
logging.info(
|
|
f'trackmedecisionmaker has terminated, tenant_id="{self.tenant_id}", component="{self.component}", instance_id="{self.instance_id}", upstream_records="{records_count}", processed_records="{len(processed_records)}", run_time="{round(time.time() - start, 3)}"'
|
|
)
|
|
|
|
|
|
# Module entry point: splunklib's dispatch() instantiates TrackMeDecisionMaker
# and wires it to Splunk's search pipeline via stdin/stdout (the command's
# records are streamed in and the processed records are streamed back out).
dispatch(TrackMeDecisionMaker, sys.argv, sys.stdin, sys.stdout, __name__)
|