#!/usr/bin/env python
# coding=utf-8
__author__ = "TrackMe Limited"
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
__credits__ = "TrackMe Limited, U.K."
__license__ = "TrackMe Limited, all rights reserved"
__version__ = "0.1.0"
__maintainer__ = "TrackMe Limited, U.K."
__email__ = "support@trackme-solutions.com"
__status__ = "PRODUCTION"
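#
# This generating custom command tracks ML Outliers models for a TrackMe
# tenant: for each eligible entity, it renders the outliers models, detects
# LowerBound/UpperBound breaches and updates the outliers data KVstore.
# Minimal usage sketch (the SPL command name below is an assumption, check
# commands.conf for the actual stanza):
#
#   | trackmesplkoutlierstracker tenant_id="mytenant" component="dsm"
#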
# Standard library imports
import os
import sys
import time
import json
import hashlib
# Logging imports
import logging
from logging.handlers import RotatingFileHandler
# Networking imports
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# splunk home
splunkhome = os.environ["SPLUNK_HOME"]
# set logging
filehandler = RotatingFileHandler(
"%s/var/log/splunk/trackme_splk_outliers_tracker_helper.log" % splunkhome,
mode="a",
maxBytes=10000000,
backupCount=1,
)
formatter = logging.Formatter(
"%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s"
)
logging.Formatter.converter = time.gmtime
filehandler.setFormatter(formatter)
log = logging.getLogger() # root logger - Good to get it only once.
for hdlr in log.handlers[:]: # remove the existing file handlers
if isinstance(hdlr, logging.FileHandler):
log.removeHandler(hdlr)
log.addHandler(filehandler) # set the new handler
# set the log level to INFO by default; the effective level is re-applied from TrackMe's configuration at runtime
log.setLevel(logging.INFO)
# append current directory
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# import libs
import import_declare_test
# import Splunk libs
from splunklib.searchcommands import (
dispatch,
GeneratingCommand,
Configuration,
Option,
validators,
)
# import trackme libs
from trackme_libs import (
trackme_reqinfo,
trackme_vtenant_account,
trackme_vtenant_component_info,
trackme_register_tenant_object_summary,
run_splunk_search,
trackme_handler_events,
trackme_idx_for_tenant,
trackme_gen_state,
)
# import trackme libs utils
from trackme_libs_utils import remove_leading_spaces, decode_unicode
# import trackme libs croniter
from trackme_libs_croniter import cron_to_seconds
# import trackme libs scoring
from trackme_libs_scoring import trackme_scoring_gen_metrics
@Configuration(distributed=False)
class SplkOutliersTracker(GeneratingCommand):
tenant_id = Option(
doc="""
**Syntax:** **tenant_id=***<string>*
**Description:** The tenant identifier.""",
require=True,
default=None,
)
component = Option(
doc="""
**Syntax:** **component=***<string>*
**Description:** The component category (dsm, dhm, flx, fqm or wlk).""",
require=True,
default=None,
validate=validators.Match("component", r"^(?:dsm|dhm|flx|fqm|wlk)$"),
)
object = Option(
doc="""
**Syntax:** **object=***<string>*
**Description:** Optional. The value for object.""",
require=False,
default="*",
validate=validators.Match("object", r"^.*$"),
)
object_id = Option(
doc="""
**Syntax:** **object_id=***<string>*
**Description:** Optional. The value for object id.""",
require=False,
default="*",
validate=validators.Match("object_id", r"^.*$"),
)
max_runtime = Option(
doc="""
**Syntax:** **max_runtime=***<integer>*
**Description:** Optional. The maximum total runtime of the job in seconds, defaults to 900 (15 min); a 120-second safety margin is subtracted. Once the job reaches this limit, it is terminated.""",
require=False,
default="900",
validate=validators.Match("object", r"^\d*$"),
)
def _get_log_object_ref(self, object_value=None, object_id_value=None):
"""Helper function to get object reference for logging (includes object_id when available)."""
object_id_ref = f'object_id="{object_id_value}"' if object_id_value else ""
object_ref = f'object="{object_value}"' if object_value else ""
if object_id_ref and object_ref:
return f'{object_id_ref}, {object_ref}'
elif object_id_ref:
return object_id_ref
elif object_ref:
return object_ref
else:
return 'object="*"'
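# Illustrative output (hypothetical values):
#   _get_log_object_ref(object_value="netfw", object_id_value="abc123")
#   returns 'object_id="abc123", object="netfw"'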
force_run = Option(
doc="""
**Syntax:** **force_run=***<bool>*
**Description:** Optional. Force run the monitor; if set to True, the minimal time between two monitor executions is not honoured.""",
require=False,
default="False",
validate=validators.Match("force_run", r"^(?:True|False)$"),
)
allow_auto_train = Option(
doc="""
**Syntax:** **allow_auto_train=***<bool>*
**Description:** Allows automated ML training if the model has not been trained for longer than the system-wide parameter.""",
require=False,
default=True,
)
def generate(self, **kwargs):
# performance counter
start = time.time()
# Track execution times
execution_times = []
average_execution_time = 0
# Get request info and set logging level
reqinfo = trackme_reqinfo(
self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri
)
log.setLevel(reqinfo["logging_level"])
# Get Virtual Tenant account
vtenant_account = trackme_vtenant_account(
self._metadata.searchinfo.session_key,
self._metadata.searchinfo.splunkd_uri,
self.tenant_id,
)
# get vtenant component info
vtenant_component_info = trackme_vtenant_component_info(
self._metadata.searchinfo.session_key,
self._metadata.searchinfo.splunkd_uri,
self.tenant_id,
)
logging.debug(
f'vtenant_component_info="{json.dumps(vtenant_component_info, indent=2)}"'
)
# Get the tenant indexes
tenant_indexes = trackme_idx_for_tenant(
self._metadata.searchinfo.session_key,
self._metadata.searchinfo.splunkd_uri,
self.tenant_id,
)
# check schema version migration state
try:
schema_version = int(vtenant_component_info["schema_version"])
schema_version_upgrade_in_progress = bool(
int(vtenant_component_info["schema_version_upgrade_in_progress"])
)
logging.debug(
f'schema_version_upgrade_in_progress="{schema_version_upgrade_in_progress}"'
)
except Exception as e:
schema_version = 0
schema_version_upgrade_in_progress = False
logging.error(
f'failed to retrieve schema_version_upgrade_in_progress, exception="{str(e)}"'
)
# Do not proceed if the schema version upgrade is in progress
if schema_version_upgrade_in_progress:
yield_json = {
"_time": time.time(),
"tenant_id": self.tenant_id,
"component": self.component,
"response": f'tenant_id="{self.tenant_id}", schema upgrade is currently in progress, we will wait until the process is completed before proceeding, the schema upgrade is handled by the health_tracker of the tenant and is completed once the schema_version field of the Virtual Tenants KVstore (trackme_virtual_tenants) matches TrackMe\'s version, schema_version="{schema_version}", schema_version_upgrade_in_progress="{schema_version_upgrade_in_progress}"',
"schema_version": schema_version,
"schema_version_upgrade_in_progress": schema_version_upgrade_in_progress,
}
logging.info(json.dumps(yield_json, indent=2))
yield {
"_time": yield_json["_time"],
"_raw": yield_json,
}
# Default to True (ML Outliers enabled)
outliers_feature_enabled = True
# Define the valid components
valid_components = {"dsm", "dhm", "flx", "fqm", "wlk"}
# Construct the key dynamically
key = f"mloutliers_{self.component}"
# Check if the component is valid and handle exceptions
if self.component in valid_components:
try:
if int(vtenant_account.get(key, 1)) == 0:
outliers_feature_enabled = False
except (ValueError, TypeError):
outliers_feature_enabled = True
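# Illustrative flag semantics (assuming component="dsm"): a value of 0 in
# vtenant_account["mloutliers_dsm"] disables the feature for this tenant and
# component; any other value, or a missing/invalid value, leaves it enabled.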
if not outliers_feature_enabled or schema_version_upgrade_in_progress:
if not outliers_feature_enabled:
# yield and log
results_dict = {
"tenant_id": self.tenant_id,
"action": "success",
"results": "ML Anomaly Detection feature is disabled for the tenant and component, no action taken",
"vtenant_account": vtenant_account,
}
yield {"_time": time.time(), "_raw": results_dict}
else: # proceed
# Get app level config
splk_outliers_time_monitor_mlmodels_default = reqinfo["trackme_conf"][
"splk_outliers_detection"
]["splk_outliers_time_monitor_mlmodels_default"]
# counter
count = 0
# scoring metrics records
scoring_metrics_records = []
# Get the session key
session_key = self._metadata.searchinfo.session_key
# Outliers rules storage collection
collection_rules_name = f"kv_trackme_{self.component}_outliers_entity_rules_tenant_{str(self.tenant_id)}"
collection_rule = self.service.kvstore[collection_rules_name]
# Outliers data storage collection
collection_data_name = f"kv_trackme_{self.component}_outliers_entity_data_tenant_{str(self.tenant_id)}"
collection_data = self.service.kvstore[collection_data_name]
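# Illustrative naming: with tenant_id="mytenant" and component="dsm", the
# rules collection resolves to kv_trackme_dsm_outliers_entity_rules_tenant_mytenant
# and the data collection to kv_trackme_dsm_outliers_entity_data_tenant_mytenant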
# Get data
kwargs_oneshot = {
"earliest_time": "-5m",
"latest_time": "now",
"output_mode": "json",
"count": 0,
}
#
# RUN
#
# report name for logging purposes
report_name = f"trackme_{self.component}_outliers_mlmonitor_tracker_tenant_{self.tenant_id}"
# max runtime
max_runtime = int(self.max_runtime)
# Retrieve the search cron schedule
savedsearch = self.service.saved_searches[report_name]
savedsearch_cron_schedule = savedsearch.content["cron_schedule"]
# get the cron_exec_sequence_sec
try:
cron_exec_sequence_sec = int(cron_to_seconds(savedsearch_cron_schedule))
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", failed to convert the cron schedule to seconds, error="{str(e)}"'
)
cron_exec_sequence_sec = max_runtime
# the max_runtime cannot be bigger than the cron_exec_sequence_sec
if max_runtime > cron_exec_sequence_sec:
max_runtime = cron_exec_sequence_sec
logging.info(
f'max_runtime="{max_runtime}", savedsearch_name="{report_name}", savedsearch_cron_schedule="{savedsearch_cron_schedule}", cron_exec_sequence_sec="{cron_exec_sequence_sec}"'
)
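# Illustrative example: a cron schedule of "*/10 * * * *" converts to
# cron_exec_sequence_sec=600, which caps a default max_runtime of 900 down
# to 600 seconds.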
# If object_id is provided, resolve it to object value for the macro
object_for_search = self.object
if self.object_id != "*" and self.object == "*":
# Query KVstore to get object value from object_id
try:
query_string_filter = {
"object_category": f"splk-{self.component}",
"_key": self.object_id,
}
query_string = {"$and": [query_string_filter]}
records_outliers_rules = collection_rule.data.query(
query=json.dumps(query_string)
)
if records_outliers_rules:
record_outliers_rules = records_outliers_rules[0]
object_for_search = record_outliers_rules.get("object", "*")
logging.debug(
f'Resolved object_id="{self.object_id}" to object="{object_for_search}"'
)
else:
logging.warning(
f'Could not resolve object_id="{self.object_id}" to object value, using "*"'
)
object_for_search = "*"
except Exception as e:
logging.error(
f'Failed to resolve object_id="{self.object_id}" to object value, exception="{str(e)}", using "*"'
)
object_for_search = "*"
# Define the search providing the list of entities which models need to be monitored
if self.force_run == "False":
search_query = remove_leading_spaces(
f"""
| `get_splk_outliers_entities("{self.tenant_id}", "{self.component}", "{object_for_search}")`
| eval duration_since_last=if(isnum(last_exec_monitor), now()-last_exec_monitor, 0)
| where duration_since_last=0 OR duration_since_last>={splk_outliers_time_monitor_mlmodels_default}
| sort - duration_since_last
"""
)
else:
search_query = remove_leading_spaces(
f"""
| `get_splk_outliers_entities("{self.tenant_id}", "{self.component}", "{object_for_search}")`
| eval duration_since_last=if(isnum(last_exec_monitor), now()-last_exec_monitor, 0)
| sort - duration_since_last
"""
)
logging.debug(f'search_query="{search_query}"')
# run search
try:
reader = run_splunk_search(
self.service,
search_query,
kwargs_oneshot,
24,
5,
)
# loop through the results, and train models per entity
# store processed entities in a list
processed_entities = []
# Initialize sum of execution times and count of iterations
total_execution_time = 0
iteration_count = 0
# Note: max_runtime was already initialized above and capped against cron_exec_sequence_sec
for item in reader:
logging.debug(f'search_results="{item}"')
current_time = time.time()
elapsed_time = current_time - start
if isinstance(item, dict):
# iteration start
iteration_start_time = time.time()
# run the resulting search
object_value = decode_unicode(item.get("object"))
model_ids = item.get("model_id")
# set the global_isOutlier
global_isOutlier = 0
global_models_in_anomaly = []
global_isOutlierReason = []
# Define the KV query
query_string_filter = {
"object_category": f"splk-{self.component}",
"object": object_value,
}
query_string = {"$and": [query_string_filter]}
# Get the current record
# Note: the query returns an array; since we search for a specific record, we expect a single result
key = None
rules_key = None # Store rules key for logging purposes
try:
records_outliers_rules = collection_rule.data.query(
query=json.dumps(query_string)
)
record_outliers_rules = records_outliers_rules[0]
key = record_outliers_rules.get("_key")
rules_key = key # Store for logging
except Exception as e:
key = None
rules_key = None
record_outliers_rules = None
# if no records, log a warning message and break
if not key:
    object_ref = self._get_log_object_ref(object_value=object_value)
    msg = f'tenant_id="{self.tenant_id}", object_category="splk-{self.component}", {object_ref} outliers rules record cannot be found or is not yet available for this entity.'
    logging.warning(msg)
    break
#
# ML confidence
#
# retrieve the values for confidence and confidence_reason from the rules KVstore
ml_confidence = record_outliers_rules.get("confidence", "low")
# log debug
logging.debug(f'record_outliers_rules="{record_outliers_rules}"')
# Get the JSON outliers rules object
entities_outliers = record_outliers_rules.get("entities_outliers")
# Load as a dict
try:
    entities_outliers = json.loads(record_outliers_rules.get("entities_outliers"))
except Exception as e:
    msg = f'Failed to load entities_outliers with exception="{str(e)}"'
    logging.error(msg)
# log debug
logging.debug(f'entities_outliers="{entities_outliers}"')
# Load the general enablement
try:
    outliers_is_disabled = int(record_outliers_rules.get("is_disabled"))
    logging.debug(f'is_disabled="{outliers_is_disabled}"')
except Exception as e:
    msg = f'Failed to extract one or more expected settings from the entity, is this record corrupted? Exception="{str(e)}"'
    logging.error(msg)
    outliers_is_disabled = 1
# process the entity if general outliers is enabled
if outliers_is_disabled == 0:
# Process all ML models per entity
processed_entity_models = {}
if model_ids:
model_ids = model_ids.split(",")
# loop through each model per entity
for model_id in model_ids:
# model configuration
try:
model_config = entities_outliers[model_id]
except Exception as e:
model_config = None
if model_config:
logging.debug(
f'configuration for model_id="{model_id}" config="{json.dumps(model_config, indent=4)}"'
)
# If the model is enabled
if int(model_config["is_disabled"]) == 0:
# Get conf from the model
alert_lower_breached = int(model_config["alert_lower_breached"])
alert_upper_breached = int(model_config["alert_upper_breached"])
# get the kpi metric name and value
kpi_metric_name = model_config["kpi_metric"]
logging.debug(f'kpi_metric_name="{kpi_metric_name}"')
# retrieve the score from the model configuration
try:
score = int(model_config.get("score", 36))
except (ValueError, TypeError):
score = 36
# Set the initial state for that model
isOutlier = 0
isOutlierReason = "None"
# Set the search - use object_id if available, otherwise fall back to object
if key:
object_param = f'object_id="{key}"'
else:
object_param = f'object="{object_value}"'
model_render_search = remove_leading_spaces(
f"""\
| trackmesplkoutliersrender tenant_id="{self.tenant_id}" component="{self.component}" {object_param} earliest="-24h" latest="now" model_id="{model_id}" allow_auto_train="{self.allow_auto_train}"
| table _time, *, LowerBound, UpperBound
| sort 0 - _time | head 1
"""
)
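# The render command is expected to return the KPI time series enriched
# with LowerBound/UpperBound and the rejected*Outlier fields inspected
# below; only the most recent event (head 1) is evaluated.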
logging.info(
f'tenant_id="{self.tenant_id}", {self._get_log_object_ref(object_value=object_value, object_id_value=rules_key)}, model_id="{model_id}", Executing resulting search="{model_render_search}"'
)
# set kwargs
kwargs_oneshot = {
"earliest_time": "-24h",
"latest_time": "now",
"search_mode": "normal",
"preview": False,
"count": 0,
"output_mode": "json",
}
# Performance timer
substart = time.time()
# Run the search
search_results = None
try:
reader = run_splunk_search(
self.service,
model_render_search,
kwargs_oneshot,
24,
5,
)
# loop through the reader results
for item in reader:
if isinstance(item, dict):
search_results = item
# raw results logged only in debug
logging.debug(
f'search_results="{search_results}"'
)
# Inspect results
time_outlier = search_results["_time"]
# get rejectedLowerboundOutlier / rejectedUpperboundOutlier / rejectedLowerboundOutlierReason / rejectedUpperboundOutlierReason
try:
    rejectedLowerboundOutlier = int(search_results["rejectedLowerboundOutlier"])
except Exception:
    rejectedLowerboundOutlier = 0
try:
    rejectedUpperboundOutlier = int(search_results["rejectedUpperboundOutlier"])
except Exception:
    rejectedUpperboundOutlier = 0
try:
    rejectedLowerboundOutlierReason = search_results["rejectedLowerboundOutlierReason"]
except Exception:
    rejectedLowerboundOutlierReason = "N/A"
try:
    rejectedUpperboundOutlierReason = search_results["rejectedUpperboundOutlierReason"]
except Exception:
    rejectedUpperboundOutlierReason = "N/A"
# try to get the LowerBound and UpperBound
try:
    LowerBound = search_results["LowerBound"]
except Exception:
    LowerBound = None
    logging.warning(f'Could not retrieve a LowerBound value from item="{item}"')
try:
    UpperBound = search_results["UpperBound"]
except Exception:
    UpperBound = None
    logging.warning(f'Could not retrieve a UpperBound value from item="{item}"')
try:
    kpi_metric_value = search_results[kpi_metric_name]
    logging.debug(f'kpi_metric_value="{kpi_metric_value}"')
except Exception:
    kpi_metric_value = None
    logging.warning(f'Could not retrieve the kpi_metric_value from item="{item}"')
# Define the outliers status
if kpi_metric_value and LowerBound and UpperBound:
    if int(alert_lower_breached) == 1 and float(kpi_metric_value) < float(LowerBound):
        # Enforce policy for rejected lowerBound outliers
        if rejectedLowerboundOutlier == 1:
            isOutlier = 0
            isOutlierReason = f'Outliers ML for kpi="{kpi_metric_name}", model_id="{model_id}", LowerBound="{round(float(LowerBound), 3)}" has been rejected, rejectedLowerboundOutlierReason="{rejectedLowerboundOutlierReason}", kpi_metric_value="{round(float(kpi_metric_value), 3)}" at time="{time_outlier}", Outlier will not be considered.'
        # Accept Outlier
        else:
            isOutlier = 1
            pct_decrease = ((float(LowerBound) - float(kpi_metric_value)) / float(LowerBound)) * 100
            isOutlierReason = f'Outliers ML for kpi="{kpi_metric_name}", model_id="{model_id}", LowerBound="{round(float(LowerBound), 3)}" breached with kpi_metric_value="{round(float(kpi_metric_value), 3)}" at time="{time_outlier}", pct_decrease="{round(float(pct_decrease), 2)}"'
            # add to scoring metrics records
            scoring_metrics_records.append(
                {
                    "tenant_id": self.tenant_id,
                    "object_id": key,
                    "object": object_value,
                    "object_category": self.component,
                    "score_source": f"lowerbound_outlier|model_id={model_id}",
                    "metrics_event": {"score": score, "pct_decrease": round(float(pct_decrease), 2)},
                }
            )
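            # Illustrative arithmetic (hypothetical values): with
            # LowerBound=100 and kpi_metric_value=80,
            # pct_decrease = ((100 - 80) / 100) * 100 = 20.0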
    elif int(alert_upper_breached) == 1 and float(kpi_metric_value) > float(UpperBound):
        # Enforce policy for rejected upperBound outliers
        if rejectedUpperboundOutlier == 1:
            isOutlier = 0
            isOutlierReason = f'Outliers ML for kpi="{kpi_metric_name}", model_id="{model_id}", UpperBound="{round(float(UpperBound), 3)}" has been rejected, rejectedUpperboundOutlierReason="{rejectedUpperboundOutlierReason}", kpi_metric_value="{round(float(kpi_metric_value), 3)}" at time="{time_outlier}", Outlier will not be considered.'
        # Accept Outlier
        else:
            isOutlier = 1
            pct_increase = ((float(kpi_metric_value) - float(UpperBound)) / float(UpperBound)) * 100
            isOutlierReason = f'Outliers ML for kpi="{kpi_metric_name}", model_id="{model_id}", UpperBound="{round(float(UpperBound), 3)}" breached with kpi_metric_value="{round(float(kpi_metric_value), 3)}" at time="{time_outlier}", pct_increase="{round(float(pct_increase), 2)}"'
            # add to scoring metrics records
            scoring_metrics_records.append(
                {
                    "tenant_id": self.tenant_id,
                    "object_id": key,
                    "object": object_value,
                    "object_category": self.component,
                    "score_source": f"upperbound_outlier|model_id={model_id}",
                    "metrics_event": {"score": score, "pct_increase": round(float(pct_increase), 2)},
                }
            )
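            # Illustrative arithmetic (hypothetical values): with
            # UpperBound=200 and kpi_metric_value=260,
            # pct_increase = ((260 - 200) / 200) * 100 = 30.0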
    else:
        isOutlier = 0
        isOutlierReason = "No outliers anomalies were detected"
# impact the global_isOutlier accordingly
if isOutlier == 1:
    # only if the confidence allows it
    if ml_confidence != "low":
        global_isOutlier = 1
        global_models_in_anomaly.append(model_id)
        global_isOutlierReason.append(isOutlierReason)
except Exception as e:
error_msg = f'tenant_id="{self.tenant_id}", object_category="{self.component}", {self._get_log_object_ref(object_value=object_value, object_id_value=rules_key)}, model_id="{model_id}", search has failed with the following exception="{str(e)}", search="{model_render_search}"'
logging.error(error_msg)
# Performance timer
model_search_runtime = round(time.time() - substart, 3)
# try loading the search results, and define a message if the search did not produce any results
try:
    summary_search_results = {
        "time": search_results["_time"],
        "_raw": json.loads(search_results["_raw"]),
        "model_render_search": model_render_search,
    }
except Exception:
    summary_search_results = "Outliers search did not produce any results"
# Insert the summary record
model_summary = {
"isOutlier": isOutlier,
"isOutlierReason": isOutlierReason,
"alert_lower_breached": alert_lower_breached,
"alert_upper_breached": alert_upper_breached,
"summary_search_results": summary_search_results,
"search_run_time": model_search_runtime,
"time_exec": time.time(),
"time_human": time.strftime(
"%c", time.localtime(time.time())
),
}
processed_entity_models[model_id] = model_summary
# log info
logging.info(
f'tenant_id="{self.tenant_id}", component="{self.component}", {self._get_log_object_ref(object_value=object_value, object_id_value=rules_key)}, model_summary="{json.dumps(model_summary, indent=4)}"'
)
# summary record for that entity
processed_entity_record = {
"entity": object_value,
"processed_model_ids": processed_entity_models,
}
# append
processed_entities.append(processed_entity_record)
# increment the entity counter
count += 1
#
# Finally, update the outliers data KVstore
#
# Define the KV query
query_string_filter = {
"object_category": f"splk-{self.component}",
"object": object_value,
}
query_string = {"$and": [query_string_filter]}
# Get the current record
# Note: the query returns an array; since we search for a specific record, we expect a single result
key = None
try:
records_outliers_data = collection_data.data.query(
query=json.dumps(query_string)
)
record_outliers_data = records_outliers_data[0]
key = record_outliers_data.get("_key")
except Exception as e:
key = None
if not key:
# new record
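# The KVstore _key is deterministic: the sha256 hex digest of the object
# name, so e.g. hashlib.sha256("my_object".encode("utf-8")).hexdigest()
# always resolves to the same 64-character key for that entity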
try:
collection_data.data.insert(
json.dumps(
{
"_key": hashlib.sha256(
object_value.encode("utf-8")
).hexdigest(),
"object": str(object_value),
"object_category": f"splk-{self.component}",
"mtime": str(time.time()),
"isOutlier": global_isOutlier,
"isOutlierReason": global_isOutlierReason,
"models_in_anomaly": global_models_in_anomaly,
"models_summary": json.dumps(
processed_entity_models,
indent=4,
),
}
)
)
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", component="{self.component}", {self._get_log_object_ref(object_value=object_value, object_id_value=rules_key)}, failed to insert a new KVstore record with exception="{str(e)}"'
)
else:
try:
# update existing record
collection_data.data.update(
str(key),
json.dumps(
{
"object": str(object_value),
"object_category": f"splk-{self.component}",
"mtime": str(time.time()),
"isOutlier": global_isOutlier,
"isOutlierReason": global_isOutlierReason,
"models_in_anomaly": global_models_in_anomaly,
"models_summary": json.dumps(
processed_entity_models,
indent=4,
),
}
),
)
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", component="{self.component}", {self._get_log_object_ref(object_value=object_value, object_id_value=rules_key)}, failed to update a KVstore record with exception="{str(e)}"'
)
# Calculate the execution time for this iteration
iteration_end_time = time.time()
execution_time = iteration_end_time - iteration_start_time
# Update total execution time and iteration count
total_execution_time += execution_time
iteration_count += 1
# Calculate average execution time
if iteration_count > 0:
average_execution_time = (
total_execution_time / iteration_count
)
else:
average_execution_time = 0
# Check if there is enough time left to continue
current_time = time.time()
elapsed_time = current_time - start
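# Illustrative check (hypothetical values): with max_runtime=900,
# elapsed_time=700 and average_execution_time=90, 700 + 90 + 120 = 910 >= 900,
# so the loop stops before overrunning the scheduling window.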
if elapsed_time + average_execution_time + 120 >= max_runtime:
logging.info(
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", max_runtime="{max_runtime}" is about to be reached, current_runtime="{elapsed_time}", job will be terminated now'
)
break
# end
if int(count) > 0:
logging.info(
f'tenant_id="{self.tenant_id}" outliers tracker job successfully executed, status="success", run_time="{round(time.time() - start, 3)}", report="{str(report_name)}", entities_count="{str(count)}"'
)
# yield
results_dict = {
"tenant_id": self.tenant_id,
"action": "success",
"results": "outliers tracker job successfully executed",
"run_time": round((time.time() - start), 3),
"entities_count": str(count),
"processed_entities": processed_entities,
"upstream_search_query": search_query,
}
yield {"_time": time.time(), "_raw": results_dict}
# handler event
handler_events_records = []
for object_record in processed_entities:
handler_events_records.append(
{
"object": object_record.get("entity"),
"object_id": hashlib.sha256(
object_record.get("entity").encode("utf-8")
).hexdigest(),
"object_category": f"splk-{self.component}",
"handler": f"trackme_{self.component}_outliers_mlmonitor_tracker_tenant_{self.tenant_id}",
"handler_message": "Entity was rendered for ML Outliers.",
"handler_troubleshoot_search": f"index=_internal sourcetype=trackme:custom_commands:trackmesplkoutliersrender tenant_id={self.tenant_id} object=\"{object_record.get('entity')}\"",
"handler_time": time.time(),
}
)
# notification event
try:
trackme_handler_events(
session_key=self._metadata.searchinfo.session_key,
splunkd_uri=self._metadata.searchinfo.splunkd_uri,
tenant_id=self.tenant_id,
sourcetype="trackme:handler",
source=f"trackme:handler:{self.tenant_id}",
handler_events=handler_events_records,
)
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", could not send notification event, exception="{e}"'
)
# call the scoring gen metrics function
scoring_metrics_gen_start = time.time()
try:
scoring_metrics = trackme_scoring_gen_metrics(
self.tenant_id,
tenant_indexes.get("trackme_metric_idx"),
scoring_metrics_records,
)
logging.info(
f'context="scoring_gen_metrics", tenant_id="{self.tenant_id}", function trackme_scoring_gen_metrics success, results="{scoring_metrics}", run_time={round(time.time()-scoring_metrics_gen_start, 3)}, no_entities={len(scoring_metrics_records)}'
)
except Exception as e:
logging.error(
f'context="scoring_gen_metrics", tenant_id="{self.tenant_id}", function trackme_scoring_gen_metrics failed with exception {str(e)}'
)
# also generate events for the score
for score_event in scoring_metrics_records:
try:
trackme_gen_state(
index=tenant_indexes.get("trackme_summary_idx"),
sourcetype="trackme:score",
source=f"trackme_{self.component}_outliers_mlmonitor_tracker_tenant_{self.tenant_id}",
event=score_event,
)
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", component="{self.component}", failed to generate score state event, exception="{str(e)}"'
)
else:
logging.info(
f'tenant_id="{self.tenant_id}" outliers tracker job successfully executed but there were no entities to be tracked at this time, status="success", run_time="{round(time.time() - start, 3)}", report="{str(report_name)}", entities_count="{str(count)}"'
)
# yield
results_dict = {
"tenant_id": self.tenant_id,
"action": "success",
"results": "outliers tracker job successfully executed but there were no entities to be tracked at this time",
"run_time": round((time.time() - start), 3),
"entities_count": str(count),
"upstream_search_query": search_query,
}
yield {"_time": time.time(), "_raw": results_dict}
# Call the component register
trackme_register_tenant_object_summary(
session_key,
self._metadata.searchinfo.splunkd_uri,
self.tenant_id,
f"splk-{self.component}",
f"trackme_{self.component}_outliers_mlmonitor_tracker_tenant_{str(self.tenant_id)}",
"success",
time.time(),
round(time.time() - start, 3),
"The report was executed successfully",
"-24h",
"now",
)
except Exception as e:
trackme_register_tenant_object_summary(
session_key,
self._metadata.searchinfo.splunkd_uri,
self.tenant_id,
f"splk-{self.component}",
f"trackme_{self.component}_outliers_mlmonitor_tracker_tenant_{str(self.tenant_id)}",
"failure",
time.time(),
round(time.time() - start, 3),
str(e),
"-24h",
"now",
)
msg = f'tenant_id="{self.tenant_id}", permanent search failure, exception="{str(e)}", search_query="{search_query}", search_kwargs="{kwargs_oneshot}"'
logging.error(msg)
raise Exception(msg)
dispatch(SplkOutliersTracker, sys.argv, sys.stdin, sys.stdout, __name__)