You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1084 lines
57 KiB
1084 lines
57 KiB
#!/usr/bin/env python
# coding=utf-8

"""TrackMe ML Outliers tracker helper command (trackmesplkoutlierstracker).

Scheduled per tenant/component, this generating command monitors the ML
outliers models of the tenant entities and updates the outliers data KVstore.
"""

__author__ = "TrackMe Limited"
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
__credits__ = "TrackMe Limited, U.K."
__license__ = "TrackMe Limited, all rights reserved"
__version__ = "0.1.0"
__maintainer__ = "TrackMe Limited, U.K."
__email__ = "support@trackme-solutions.com"
__status__ = "PRODUCTION"
|
|
|
|
# Standard library imports
|
|
import os
|
|
import sys
|
|
import time
|
|
import json
|
|
import hashlib
|
|
|
|
# Logging imports
|
|
import logging
|
|
from logging.handlers import RotatingFileHandler
|
|
|
|
# Networking imports
|
|
import urllib3
|
|
|
|
# Disable TLS certificate warnings: local splunkd connections commonly use
# self-signed certificates
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# splunk home (raises KeyError if not executed within a Splunk environment)
splunkhome = os.environ["SPLUNK_HOME"]

# set logging: dedicated rotating log file under Splunk's log directory
filehandler = RotatingFileHandler(
    "%s/var/log/splunk/trackme_splk_outliers_tracker_helper.log" % splunkhome,
    mode="a",
    maxBytes=10000000,
    backupCount=1,
)
formatter = logging.Formatter(
    "%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s"
)
# log timestamps in UTC
logging.Formatter.converter = time.gmtime
filehandler.setFormatter(formatter)
log = logging.getLogger()  # root logger - Good to get it only once.
for hdlr in log.handlers[:]:  # remove the existing file handlers
    if isinstance(hdlr, logging.FileHandler):
        log.removeHandler(hdlr)
log.addHandler(filehandler)  # set the new handler
# set the log level to INFO, DEBUG as the default is ERROR
# (the effective level is refreshed from the tenant configuration in generate())
log.setLevel(logging.INFO)

# append current directory so that local libraries (import_declare_test,
# trackme_libs*) can be imported
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
|
|
|
# import libs
|
|
import import_declare_test
|
|
|
|
# import Splunk libs
|
|
from splunklib.searchcommands import (
|
|
dispatch,
|
|
GeneratingCommand,
|
|
Configuration,
|
|
Option,
|
|
validators,
|
|
)
|
|
|
|
# import trackme libs
|
|
from trackme_libs import (
|
|
trackme_reqinfo,
|
|
trackme_vtenant_account,
|
|
trackme_vtenant_component_info,
|
|
trackme_register_tenant_object_summary,
|
|
run_splunk_search,
|
|
trackme_handler_events,
|
|
trackme_idx_for_tenant,
|
|
trackme_gen_state,
|
|
)
|
|
|
|
# import trackme libs utils
|
|
from trackme_libs_utils import remove_leading_spaces, decode_unicode
|
|
|
|
# import trackme libs croniter
|
|
from trackme_libs_croniter import cron_to_seconds
|
|
|
|
# import trackme libs scoring
|
|
from trackme_libs_scoring import trackme_scoring_gen_metrics
|
|
|
|
@Configuration(distributed=False)
class SplkOutliersTracker(GeneratingCommand):
    """Generating command monitoring ML outliers models for a tenant/component.

    For each eligible entity, runs the trackmesplkoutliersrender search per ML
    model, derives the outlier state and maintains the outliers data KVstore.
    See generate() for the main logic.
    """

    # required: the Virtual Tenant identifier
    tenant_id = Option(
        doc="""
        **Syntax:** **tenant_id=****
        **Description:** The tenant identifier.""",
        require=True,
        default=None,
    )

    # required: the TrackMe component, restricted to the supported components
    component = Option(
        doc="""
        **Syntax:** **component=****
        **Description:** The component category.""",
        require=True,
        default=None,
        validate=validators.Match("component", r"^(?:dsm|dhm|flx|fqm|wlk)$"),
    )

    # optional: restrict the run to a given entity object value ("*" = all)
    object = Option(
        doc="""
        **Syntax:** **object=****
        **Description:** Optional, The value for object.""",
        require=False,
        default="*",
        validate=validators.Match("object", r"^.*$"),
    )

    # optional: restrict the run to a given entity object_id (rules KVstore
    # _key); resolved to the object value at runtime when object is "*"
    object_id = Option(
        doc="""
        **Syntax:** **object_id=****
        **Description:** Optional, The value for object id.""",
        require=False,
        default="*",
        validate=validators.Match("object_id", r"^.*$"),
    )
|
|
|
|
max_runtime = Option(
|
|
doc="""
|
|
**Syntax:** **max_runtime=****
|
|
**Description:** Optional, The max value in seconds for the total runtime of the job, defaults to 900 (15 min) which is substracted by 120 sec of margin. Once the job reaches this, it gets terminated""",
|
|
require=False,
|
|
default="900",
|
|
validate=validators.Match("object", r"^\d*$"),
|
|
)
|
|
|
|
def _get_log_object_ref(self, object_value=None, object_id_value=None):
|
|
"""Helper function to get object reference for logging (includes object_id when available)."""
|
|
object_id_ref = f'object_id="{object_id_value}"' if object_id_value else ""
|
|
object_ref = f'object="{object_value}"' if object_value else ""
|
|
if object_id_ref and object_ref:
|
|
return f'{object_id_ref}, {object_ref}'
|
|
elif object_id_ref:
|
|
return object_id_ref
|
|
elif object_ref:
|
|
return object_ref
|
|
else:
|
|
return 'object="*"'
|
|
|
|
    # optional: when "True", bypass the minimal delay between two monitor
    # executions of the same entity
    force_run = Option(
        doc="""
        **Syntax:** **force_run=****
        **Description:** Optional, force run monitor, if set to True we will not honour the minimal time betwen two monitor execution.""",
        require=False,
        default="False",
        validate=validators.Match("force_run", r"^(?:True|False)$"),
    )

    # optional: allow automated ML training when the model has not been trained
    # recently (interpolated into the render search)
    # NOTE(review): default is the boolean True while force_run defaults to the
    # string "False" — the value is only formatted into the SPL string, so
    # behavior is unchanged; confirm before normalizing
    allow_auto_train = Option(
        doc="""
        **Syntax:** **allow_auto_train=****
        **Description:** Allows automated ML training if not trained since more than system wide parameter.""",
        require=False,
        default=True,
    )
|
|
|
|
    def generate(self, **kwargs):
        """Run the ML outliers monitoring tracker for the tenant/component.

        Workflow:
        - retrieve request info, tenant account, component info and indexes
        - stop early when a tenant schema upgrade is in progress, or when the
          ML Outliers feature is disabled for this component
        - run the entity-listing search; for each entity and each of its
          enabled ML models, run the trackmesplkoutliersrender search, derive
          the outlier state and upsert the outliers data KVstore record
        - emit handler events, scoring metrics and score state events, then
          register the tracker execution summary (success or failure)

        Yields a single summary dict (plus an informational record first when
        a schema upgrade is in progress). Terminates early when the max_runtime
        budget (minus a 120 sec margin) is about to be exceeded.
        """
        # performance counter
        start = time.time()

        # Track execution times
        # NOTE(review): execution_times is never appended to — the loop uses
        # total_execution_time / iteration_count instead
        execution_times = []
        average_execution_time = 0

        # Get request info and set logging level
        reqinfo = trackme_reqinfo(
            self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri
        )
        log.setLevel(reqinfo["logging_level"])

        # Get Virtual Tenant account
        vtenant_account = trackme_vtenant_account(
            self._metadata.searchinfo.session_key,
            self._metadata.searchinfo.splunkd_uri,
            self.tenant_id,
        )

        # get vtenant component info
        vtenant_component_info = trackme_vtenant_component_info(
            self._metadata.searchinfo.session_key,
            self._metadata.searchinfo.splunkd_uri,
            self.tenant_id,
        )
        logging.debug(
            f'vtenant_component_info="{json.dumps(vtenant_component_info, indent=2)}"'
        )

        # Get the tenant indexes
        tenant_indexes = trackme_idx_for_tenant(
            self._metadata.searchinfo.session_key,
            self._metadata.searchinfo.splunkd_uri,
            self.tenant_id,
        )

        # check schema version migration state
        try:
            schema_version = int(vtenant_component_info["schema_version"])
            schema_version_upgrade_in_progress = bool(
                int(vtenant_component_info["schema_version_upgrade_in_progress"])
            )
            logging.debug(
                f'schema_version_upgrade_in_progress="{schema_version_upgrade_in_progress}"'
            )
        except Exception as e:
            # default to "no upgrade in progress" so the tracker still runs
            schema_version = 0
            schema_version_upgrade_in_progress = False
            logging.error(
                f'failed to retrieve schema_version_upgrade_in_progress=, exception="{str(e)}"'
            )

        # Do not proceed if the schema version upgrade is in progress
        # (this yields an informational record; the actual skip is enforced by
        # the combined check further below)
        if schema_version_upgrade_in_progress:
            yield_json = {
                "_time": time.time(),
                "tenant_id": self.tenant_id,
                "component": self.component,
                "response": f'tenant_id="{self.tenant_id}", schema upgrade is currently in progress, we will wait until the process is completed before proceeding, the schema upgrade is handled by the health_tracker of the tenant and is completed once the schema_version field of the Virtual Tenants KVstore (trackme_virtual_tenants) matches TrackMe\'s version, schema_version="{schema_version}", schema_version_upgrade_in_progress="{schema_version_upgrade_in_progress}"',
                "schema_version": schema_version,
                "schema_version_upgrade_in_progress": schema_version_upgrade_in_progress,
            }
            logging.info(json.dumps(yield_json, indent=2))
            yield {
                "_time": yield_json["_time"],
                "_raw": yield_json,
            }

        # Default to True (ML Outliers enabled)
        outliers_feature_enabled = True

        # Define the valid components
        valid_components = {"dsm", "dhm", "flx", "fqm", "wlk"}

        # Construct the key dynamically (per-component feature flag in the
        # tenant account, e.g. mloutliers_dsm)
        key = f"mloutliers_{self.component}"

        # Check if the component is valid and handle exceptions
        if self.component in valid_components:
            try:
                if int(vtenant_account.get(key, 1)) == 0:
                    outliers_feature_enabled = False
            except (ValueError, TypeError):
                # a non-numeric flag value means the feature stays enabled
                outliers_feature_enabled = True

        if not outliers_feature_enabled or schema_version_upgrade_in_progress:
            if not outliers_feature_enabled:
                # yield and log
                results_dict = {
                    "tenant_id": self.tenant_id,
                    "action": "success",
                    "results": "ML Anomaly Detection feature is disabled for the tenant and component, no action taken",
                    "vtenant_account": vtenant_account,
                }
                yield {"_time": time.time(), "_raw": results_dict}

        else:  # proceed
            # Get app level config: minimal delay in seconds between two
            # monitor executions of the same entity
            splk_outliers_time_monitor_mlmodels_default = reqinfo["trackme_conf"][
                "splk_outliers_detection"
            ]["splk_outliers_time_monitor_mlmodels_default"]

            # counter of processed entities
            count = 0

            # scoring metrics records
            scoring_metrics_records = []

            # Get the session key
            session_key = self._metadata.searchinfo.session_key

            # Outliers rules storage collection
            collection_rules_name = f"kv_trackme_{self.component}_outliers_entity_rules_tenant_{str(self.tenant_id)}"
            collection_rule = self.service.kvstore[collection_rules_name]

            # Outliers data storage collection
            collection_data_name = f"kv_trackme_{self.component}_outliers_entity_data_tenant_{str(self.tenant_id)}"
            collection_data = self.service.kvstore[collection_data_name]

            # Get data: kwargs for the entity-listing search
            kwargs_oneshot = {
                "earliest_time": "-5m",
                "latest_time": "now",
                "output_mode": "json",
                "count": 0,
            }

            #
            # RUN
            #

            # report name for logging purposes
            report_name = f"trackme_{self.component}_outliers_mlmonitor_tracker_tenant_{self.tenant_id}"

            # max runtime
            max_runtime = int(self.max_runtime)

            # Retrieve the search cron schedule
            savedsearch = self.service.saved_searches[report_name]
            savedsearch_cron_schedule = savedsearch.content["cron_schedule"]

            # get the cron_exec_sequence_sec (interval between two scheduled
            # executions), falling back to max_runtime on failure
            try:
                cron_exec_sequence_sec = int(cron_to_seconds(savedsearch_cron_schedule))
            except Exception as e:
                logging.error(
                    f'tenant_id="{self.tenant_id}", component="splk-{self.component}", failed to convert the cron schedule to seconds, error="{str(e)}"'
                )
                cron_exec_sequence_sec = max_runtime

            # the max_runtime cannot be bigger than the cron_exec_sequence_sec
            if max_runtime > cron_exec_sequence_sec:
                max_runtime = cron_exec_sequence_sec

            logging.info(
                f'max_runtime="{max_runtime}", savedsearch_name="{report_name}", savedsearch_cron_schedule="{savedsearch_cron_schedule}", cron_exec_sequence_sec="{cron_exec_sequence_sec}"'
            )

            # If object_id is provided, resolve it to object value for the macro
            object_for_search = self.object
            if self.object_id != "*" and self.object == "*":
                # Query KVstore to get object value from object_id
                try:
                    query_string_filter = {
                        "object_category": f"splk-{self.component}",
                        "_key": self.object_id,
                    }
                    query_string = {"$and": [query_string_filter]}
                    records_outliers_rules = collection_rule.data.query(
                        query=json.dumps(query_string)
                    )
                    if records_outliers_rules:
                        record_outliers_rules = records_outliers_rules[0]
                        object_for_search = record_outliers_rules.get("object", "*")
                        logging.debug(
                            f'Resolved object_id="{self.object_id}" to object="{object_for_search}"'
                        )
                    else:
                        logging.warning(
                            f'Could not resolve object_id="{self.object_id}" to object value, using "*"'
                        )
                        object_for_search = "*"
                except Exception as e:
                    logging.error(
                        f'Failed to resolve object_id="{self.object_id}" to object value, exception="{str(e)}", using "*"'
                    )
                    object_for_search = "*"

            # Define the search providing the list of entities which models need to be monitored
            # (unless force_run is set, entities monitored more recently than
            # the configured minimal delay are filtered out)
            if self.force_run == "False":
                search_query = remove_leading_spaces(
                    f"""
                | `get_splk_outliers_entities("{self.tenant_id}", "{self.component}", "{object_for_search}")`
                | eval duration_since_last=if(isnum(last_exec_monitor), now()-last_exec_monitor, 0)
                | where duration_since_last=0 OR duration_since_last>={splk_outliers_time_monitor_mlmodels_default}
                | sort - duration_since_last
                """
                )

            else:
                search_query = remove_leading_spaces(
                    f"""
                | `get_splk_outliers_entities("{self.tenant_id}", "{self.component}", "{object_for_search}")`
                | eval duration_since_last=if(isnum(last_exec_monitor), now()-last_exec_monitor, 0)
                | sort - duration_since_last
                """
                )

            logging.debug(f'search_query="{search_query}"')

            # run search
            try:
                reader = run_splunk_search(
                    self.service,
                    search_query,
                    kwargs_oneshot,
                    24,
                    5,
                )

                # loop through the results, and train models per entity

                # store processed entities in a list
                processed_entities = []

                # Initialize sum of execution times and count of iterations
                total_execution_time = 0
                iteration_count = 0

                # Other initializations
                max_runtime = int(self.max_runtime)

                # NOTE(review): the nested model search below re-binds the
                # names reader and item; the outer for-loop keeps its own
                # iterator reference, so iteration is unaffected
                for item in reader:
                    logging.debug(f'search_results="{item}"')

                    current_time = time.time()
                    elapsed_time = current_time - start

                    if isinstance(item, dict):
                        # iteration start
                        iteration_start_time = time.time()

                        # run the resulting search
                        object_value = decode_unicode(item.get("object"))
                        model_ids = item.get("model_id")

                        # set the global_isOutlier (aggregated over all models
                        # of this entity)
                        global_isOutlier = 0
                        global_models_in_anomaly = []
                        global_isOutlierReason = []

                        # Define the KV query
                        query_string_filter = {
                            "object_category": f"splk-{self.component}",
                            "object": object_value,
                        }

                        query_string = {"$and": [query_string_filter]}

                        # Get the current record
                        # Notes: the record is returned as an array, as we search for a specific record, we expect one record only
                        key = None
                        rules_key = None  # Store rules key for logging purposes

                        try:
                            records_outliers_rules = collection_rule.data.query(
                                query=json.dumps(query_string)
                            )
                            record_outliers_rules = records_outliers_rules[0]
                            key = record_outliers_rules.get("_key")
                            rules_key = key  # Store for logging

                        except Exception as e:
                            key = None
                            rules_key = None
                            record_outliers_rules = None

                        # if no records, log a warning message and break
                        # NOTE(review): logging.warn is a deprecated alias of
                        # logging.warning
                        if not key:
                            object_ref = self._get_log_object_ref(object_value=self.object)
                            msg = f'tenant_id="{self.tenant_id}", object_category="splk-{self.component}", {object_ref} outliers rules record cannot be found or are not yet available for this entity.'
                            logging.warn(msg)
                            break

                        #
                        # ML confidence
                        #

                        # retrieve the values for confidence and confidence_reason from the rules KVstore
                        ml_confidence = record_outliers_rules.get("confidence", "low")

                        # log debug
                        logging.debug(
                            f'record_outliers_rules="{record_outliers_rules}"'
                        )

                        # Get the JSON outliers rules object
                        entities_outliers = record_outliers_rules.get(
                            "entities_outliers"
                        )

                        # Load as a dict (on failure the raw string value from
                        # above is retained; msg is not logged here)
                        try:
                            entities_outliers = json.loads(
                                record_outliers_rules.get("entities_outliers")
                            )
                        except Exception as e:
                            msg = f'Failed to load entities_outliers with exception="{str(e)}"'

                        # log debug
                        logging.debug(f'entities_outliers="{entities_outliers}"')

                        # Load the general enablement (defaults to disabled on
                        # a corrupted record)
                        try:
                            outliers_is_disabled = int(
                                record_outliers_rules.get("is_disabled")
                            )
                            logging.debug(f'is_disabled="{outliers_is_disabled}"')

                        except Exception as e:
                            # NOTE(review): the exception is not interpolated —
                            # the message is logged with a literal "{}"
                            msg = 'Failed to extract one or more expected settings from the entity, is this record corrupted? Exception="{}"'
                            logging.error(msg)
                            outliers_is_disabled = 1

                        # process the entity if general outliers is enabled
                        if outliers_is_disabled == 0:
                            # Process all ML models per entity
                            processed_entity_models = {}

                            if model_ids:
                                model_ids = model_ids.split(",")

                                # loop through each model per entity
                                for model_id in model_ids:
                                    # model configuration
                                    try:
                                        model_config = entities_outliers[model_id]
                                    except Exception as e:
                                        model_config = None

                                    if model_config:
                                        logging.debug(
                                            f'configuration for model_id="{model_id}" config="{json.dumps(model_config, indent=4)}"'
                                        )

                                        # If the model is enabled
                                        if int(model_config["is_disabled"]) == 0:
                                            # Get conf from the model
                                            alert_lower_breached = int(
                                                model_config["alert_lower_breached"]
                                            )
                                            alert_upper_breached = int(
                                                model_config["alert_upper_breached"]
                                            )

                                            # get the kpi metric name and value
                                            kpi_metric_name = model_config["kpi_metric"]
                                            logging.debug(
                                                f'kpi_metric_name="{kpi_metric_name}"'
                                            )

                                            # retrieve the score from the model
                                            # configuration (default: 36)
                                            try:
                                                score = int(model_config.get("score", 36))
                                            except (ValueError, TypeError):
                                                score = 36

                                            # Set the initial state for that model
                                            isOutlier = 0
                                            isOutlierReason = "None"

                                            # Set the search - use object_id if available, otherwise fall back to object
                                            if key:
                                                object_param = f'object_id="{key}"'
                                            else:
                                                object_param = f'object="{object_value}"'

                                            model_render_search = remove_leading_spaces(
                                                f"""\
                                            | trackmesplkoutliersrender tenant_id="{self.tenant_id}" component="{self.component}" {object_param} earliest="-24h" latest="now" model_id="{model_id}" allow_auto_train="{self.allow_auto_train}"
                                            | table _time, *, LowerBound, UpperBound
                                            | sort 0 - _time | head 1
                                            """
                                            )
                                            logging.info(
                                                f'tenant_id="{self.tenant_id}", {self._get_log_object_ref(object_value=object_value, object_id_value=rules_key)}, model_id="{model_id}", Executing resulting search="{model_render_search}"'
                                            )

                                            # set kwargs
                                            kwargs_oneshot = {
                                                "earliest_time": "-24h",
                                                "latest_time": "now",
                                                "search_mode": "normal",
                                                "preview": False,
                                                "count": 0,
                                                "output_mode": "json",
                                            }

                                            # Performance timer
                                            substart = time.time()

                                            # Run the search
                                            search_results = None
                                            try:
                                                reader = run_splunk_search(
                                                    self.service,
                                                    model_render_search,
                                                    kwargs_oneshot,
                                                    24,
                                                    5,
                                                )

                                                # loop through the reader results
                                                # (the search ends with head 1,
                                                # so a single dict is expected)
                                                for item in reader:
                                                    if isinstance(item, dict):
                                                        search_results = item

                                                        # raw results logged only in debug
                                                        logging.debug(
                                                            f'search_results="{search_results}"'
                                                        )

                                                        # Inspect results
                                                        time_outlier = search_results["_time"]

                                                        # get rejectedLowerboundOutlier / rejectedUpperboundOutlier / rejectedLowerboundOutlierReason / rejectedUpperboundOutlierReason
                                                        try:
                                                            rejectedLowerboundOutlier = int(
                                                                (search_results["rejectedLowerboundOutlier"])
                                                            )
                                                        except Exception as e:
                                                            rejectedLowerboundOutlier = 0

                                                        try:
                                                            rejectedUpperboundOutlier = int(
                                                                (search_results["rejectedUpperboundOutlier"])
                                                            )
                                                        except Exception as e:
                                                            rejectedUpperboundOutlier = 0

                                                        try:
                                                            rejectedLowerboundOutlierReason = search_results["rejectedLowerboundOutlierReason"]
                                                        except Exception as e:
                                                            rejectedLowerboundOutlierReason = "N/A"

                                                        try:
                                                            rejectedUpperboundOutlierReason = search_results["rejectedUpperboundOutlierReason"]
                                                        except Exception as e:
                                                            rejectedUpperboundOutlierReason = "N/A"

                                                        # try to get the LowerBound and UpperBound
                                                        try:
                                                            LowerBound = search_results["LowerBound"]
                                                        except Exception as e:
                                                            LowerBound = None
                                                            logging.warning(
                                                                f'Could not retrieve a LowerBound value from item="{item}"'
                                                            )

                                                        try:
                                                            UpperBound = search_results["UpperBound"]
                                                        except Exception as e:
                                                            UpperBound = None
                                                            logging.warning(
                                                                f'Could not retrieve a UpperBound value from item="{item}"'
                                                            )

                                                        try:
                                                            kpi_metric_value = (
                                                                search_results[kpi_metric_name]
                                                            )
                                                            logging.debug(
                                                                f'kpi_metric_value="{kpi_metric_value}"'
                                                            )
                                                        except Exception as e:
                                                            kpi_metric_value = None
                                                            logging.warning(
                                                                f'Could not retrieve the kpi_metric_value from item="{item}"'
                                                            )

                                                        # Define the outliers status
                                                        # (all three values must be present and truthy)
                                                        if (
                                                            kpi_metric_value
                                                            and LowerBound
                                                            and UpperBound
                                                        ):
                                                            if int(alert_lower_breached) == 1 and float(kpi_metric_value) < float(LowerBound):

                                                                # Enforce policy for rejected lowerBound outliers
                                                                if rejectedLowerboundOutlier == 1:
                                                                    isOutlier = 0
                                                                    isOutlierReason = f'Outliers ML for kpi="{kpi_metric_name}", model_id="{model_id}", LowerBound="{round(float(LowerBound), 3)}" has been rejected, rejectedLowerboundOutlierReason="{rejectedLowerboundOutlierReason}", kpi_metric_value="{round(float(kpi_metric_value), 3)}" at time="{time_outlier}", Outlier will not be considered.'

                                                                # Accept Outlier
                                                                else:
                                                                    isOutlier = 1
                                                                    pct_decrease = (
                                                                        (float(LowerBound) - float(kpi_metric_value))
                                                                        / float(LowerBound)
                                                                    ) * 100
                                                                    isOutlierReason = f'Outliers ML for kpi="{kpi_metric_name}", model_id="{model_id}", LowerBound="{round(float(LowerBound), 3)}" breached with kpi_metric_value="{round(float(kpi_metric_value), 3)}" at time="{time_outlier}", pct_decrease="{round(float(pct_decrease), 2)}"'

                                                                    # add to scoring metrics records
                                                                    scoring_metrics_records.append(
                                                                        {
                                                                            "tenant_id": self.tenant_id,
                                                                            "object_id": key,
                                                                            "object": object_value,
                                                                            "object_category": self.component,
                                                                            "score_source": f"lowerbound_outlier|model_id={model_id}",
                                                                            "metrics_event": {"score": score, "pct_decrease": round(float(pct_decrease), 2)},
                                                                        }
                                                                    )

                                                            elif int(alert_upper_breached) == 1 and float(kpi_metric_value) > float(UpperBound):

                                                                # Enforce policy for rejected upperBound outliers
                                                                if rejectedUpperboundOutlier == 1:
                                                                    isOutlier = 0
                                                                    isOutlierReason = f'Outliers ML for kpi="{kpi_metric_name}", model_id="{model_id}", UpperBound="{round(float(UpperBound), 3)}" has been rejected, rejectedUpperboundOutlierReason="{rejectedUpperboundOutlierReason}", kpi_metric_value="{round(float(kpi_metric_value), 3)}" at time="{time_outlier}", Outlier will not be considered.'

                                                                # Accept Outlier
                                                                else:
                                                                    isOutlier = 1
                                                                    pct_increase = (
                                                                        (float(kpi_metric_value) - float(UpperBound))
                                                                        / float(UpperBound)
                                                                    ) * 100
                                                                    isOutlierReason = f'Outliers ML for kpi="{kpi_metric_name}", model_id="{model_id}", UpperBound="{round(float(UpperBound), 3)}" breached with kpi_metric_value="{round(float(kpi_metric_value), 3)}" at time="{time_outlier}", pct_increase="{round(float(pct_increase), 2)}"'

                                                                    # add to scoring metrics records
                                                                    scoring_metrics_records.append(
                                                                        {
                                                                            "tenant_id": self.tenant_id,
                                                                            "object_id": key,
                                                                            "object": object_value,
                                                                            "object_category": self.component,
                                                                            "score_source": f"upperbound_outlier|model_id={model_id}",
                                                                            "metrics_event": {"score": score, "pct_increase": round(float(pct_increase), 2)},
                                                                        }
                                                                    )

                                                            else:
                                                                isOutlier = 0
                                                                isOutlierReason = "No outliers anomalies were detected"

                                                            # impact the global_isOutlier accordingly
                                                            if isOutlier == 1:
                                                                # only if the confidence allows it
                                                                if ml_confidence != "low":
                                                                    global_isOutlier = 1
                                                                    global_models_in_anomaly.append(model_id)
                                                                    global_isOutlierReason.append(isOutlierReason)

                                            except Exception as e:
                                                error_msg = f'tenant_id="{self.tenant_id}", object_category="{self.component}", {self._get_log_object_ref(object_value=object_value, object_id_value=rules_key)}, model_id="{model_id}", search has failed with the following exception="{str(e)}", search="{model_render_search}"'
                                                logging.error(error_msg)

                                            # Performance timer
                                            model_search_runtime = round(
                                                float(time.time()) - float(substart), 3
                                            )

                                            # try loading search results and define a message if did not produce any results
                                            try:
                                                summary_search_results = {
                                                    "time": search_results["_time"],
                                                    "_raw": json.loads(search_results["_raw"]),
                                                    "model_render_search": model_render_search,
                                                }
                                            except Exception as e:
                                                summary_search_results = "Outliers search did not produce any results"

                                            # Insert the summary record
                                            model_summary = {
                                                "isOutlier": isOutlier,
                                                "isOutlierReason": isOutlierReason,
                                                "alert_lower_breached": alert_lower_breached,
                                                "alert_upper_breached": alert_upper_breached,
                                                "summary_search_results": summary_search_results,
                                                "search_run_time": model_search_runtime,
                                                "time_exec": time.time(),
                                                "time_human": time.strftime(
                                                    "%c", time.localtime(time.time())
                                                ),
                                            }
                                            processed_entity_models[model_id] = (
                                                model_summary
                                            )

                                            # log info
                                            logging.info(
                                                f'tenant_id="{self.tenant_id}", component="{self.component}", {self._get_log_object_ref(object_value=object_value, object_id_value=rules_key)}, model_summary="{json.dumps(model_summary, indent=4)}"'
                                            )

                            # summary record for that entity
                            processed_entity_record = {
                                "entity": object_value,
                                "processed_model_ids": processed_entity_models,
                            }

                            # append
                            processed_entities.append(processed_entity_record)

                            # increment the entity counter
                            count += 1

                            #
                            # Finally, update the outliers data KVstore
                            #

                            # Define the KV query
                            query_string_filter = {
                                "object_category": f"splk-{self.component}",
                                "object": object_value,
                            }

                            query_string = {"$and": [query_string_filter]}

                            # Get the current record
                            # Notes: the record is returned as an array, as we search for a specific record, we expect one record only
                            key = None

                            try:
                                records_outliers_data = collection_data.data.query(
                                    query=json.dumps(query_string)
                                )
                                record_outliers_data = records_outliers_data[0]
                                key = record_outliers_data.get("_key")

                            except Exception as e:
                                key = None

                            if not key:
                                # new record (deterministic _key derived from
                                # the object value)
                                try:
                                    collection_data.data.insert(
                                        json.dumps(
                                            {
                                                "_key": hashlib.sha256(
                                                    object_value.encode("utf-8")
                                                ).hexdigest(),
                                                "object": str(object_value),
                                                "object_category": f"splk-{self.component}",
                                                "mtime": str(time.time()),
                                                "isOutlier": global_isOutlier,
                                                "isOutlierReason": global_isOutlierReason,
                                                "models_in_anomaly": global_models_in_anomaly,
                                                "models_summary": json.dumps(
                                                    processed_entity_models,
                                                    indent=4,
                                                ),
                                            }
                                        )
                                    )

                                except Exception as e:
                                    logging.error(
                                        f'tenant_id="{self.tenant_id}", component="{self.component}", {self._get_log_object_ref(object_value=object_value, object_id_value=rules_key)}, failed to insert a new KVstore record with exception="{str(e)}"'
                                    )

                            else:
                                try:
                                    # update existing record
                                    collection_data.data.update(
                                        str(key),
                                        json.dumps(
                                            {
                                                "object": str(object_value),
                                                "object_category": f"splk-{self.component}",
                                                "mtime": str(time.time()),
                                                "isOutlier": global_isOutlier,
                                                "isOutlierReason": global_isOutlierReason,
                                                "models_in_anomaly": global_models_in_anomaly,
                                                "models_summary": json.dumps(
                                                    processed_entity_models,
                                                    indent=4,
                                                ),
                                            }
                                        ),
                                    )

                                except Exception as e:
                                    logging.error(
                                        f'tenant_id="{self.tenant_id}", component="{self.component}", {self._get_log_object_ref(object_value=object_value, object_id_value=rules_key)}, failed to update a KVstore record with exception="{str(e)}"'
                                    )

                        # Calculate the execution time for this iteration
                        iteration_end_time = time.time()
                        execution_time = iteration_end_time - iteration_start_time

                        # Update total execution time and iteration count
                        total_execution_time += execution_time
                        iteration_count += 1

                        # Calculate average execution time
                        if iteration_count > 0:
                            average_execution_time = (
                                total_execution_time / iteration_count
                            )
                        else:
                            average_execution_time = 0

                        # Check if there is enough time left to continue
                        # (120 sec safety margin before the runtime budget)
                        current_time = time.time()
                        elapsed_time = current_time - start
                        if elapsed_time + average_execution_time + 120 >= max_runtime:
                            logging.info(
                                f'tenant_id="{self.tenant_id}", component="splk-{self.component}", max_runtime="{max_runtime}" is about to be reached, current_runtime="{elapsed_time}", job will be terminated now'
                            )
                            break

                # end
                if int(count) > 0:
                    logging.info(
                        f'tenant_id="{self.tenant_id}" outliers tracker job successfully executed, status="success", run_time="{round(time.time() - start, 3)}", report="{str(report_name)}", entities_count="{str(count)}"'
                    )
                    # yield
                    results_dict = {
                        "tenant_id": self.tenant_id,
                        "action": "success",
                        "results": "outliers tracker job successfully executed",
                        "run_time": round((time.time() - start), 3),
                        "entities_count": str(count),
                        "processed_entities": processed_entities,
                        "upstream_search_query": search_query,
                    }
                    yield {"_time": time.time(), "_raw": results_dict}

                    # handler event (one per processed entity)
                    handler_events_records = []
                    for object_record in processed_entities:
                        handler_events_records.append(
                            {
                                "object": object_record.get("entity"),
                                "object_id": hashlib.sha256(
                                    object_record.get("entity").encode("utf-8")
                                ).hexdigest(),
                                "object_category": f"splk-{self.component}",
                                "handler": f"trackme_{self.component}_outliers_mlmonitor_tracker_tenant_{self.tenant_id}",
                                "handler_message": "Entity was rendered for ML Outliers.",
                                "handler_troubleshoot_search": f"index=_internal sourcetype=trackme:custom_commands:trackmesplkoutliersrender tenant_id={self.tenant_id} object=\"{object_record.get('entity')}\"",
                                "handler_time": time.time(),
                            }
                        )

                    # notification event
                    try:
                        trackme_handler_events(
                            session_key=self._metadata.searchinfo.session_key,
                            splunkd_uri=self._metadata.searchinfo.splunkd_uri,
                            tenant_id=self.tenant_id,
                            sourcetype="trackme:handler",
                            source=f"trackme:handler:{self.tenant_id}",
                            handler_events=handler_events_records,
                        )
                    except Exception as e:
                        # NOTE(review): the message contains a literal f" —
                        # component=f"splk-..." inside the f-string looks like a
                        # formatting slip; it only affects the logged text
                        logging.error(
                            f'tenant_id="{self.tenant_id}", component=f"splk-{self.component}", could not send notification event, exception="{e}"'
                        )

                    # call the scoring gen metrics function
                    scoring_metrics_gen_start = time.time()
                    try:
                        scoring_metrics = trackme_scoring_gen_metrics(
                            self.tenant_id,
                            tenant_indexes.get("trackme_metric_idx"),
                            scoring_metrics_records,
                        )
                        logging.info(
                            f'context="scoring_gen_metrics", tenant_id="{self.tenant_id}", function trackme_scoring_gen_metrics success {scoring_metrics}, run_time={round(time.time()-scoring_metrics_gen_start, 3)}, no_entities={len(scoring_metrics_records)}'
                        )
                    except Exception as e:
                        logging.error(
                            f'context="scoring_gen_metrics", tenant_id="{self.tenant_id}", function trackme_scoring_gen_metrics failed with exception {str(e)}'
                        )

                    # also generate events for the score
                    for score_event in scoring_metrics_records:
                        try:
                            trackme_gen_state(
                                index=tenant_indexes.get("trackme_summary_idx"),
                                sourcetype="trackme:score",
                                source=f"trackme_{self.component}_outliers_mlmonitor_tracker_tenant_{self.tenant_id}",
                                event=score_event,
                            )
                        except Exception as e:
                            logging.error(
                                f'tenant_id="{self.tenant_id}", component="{self.component}", failed to generate score state event, exception="{str(e)}"'
                            )

                else:
                    logging.info(
                        f'tenant_id="{self.tenant_id}" outliers tracker job successfully executed but there were no entities to be tracked at this time, status="success", run_time="{round(time.time() - start, 3)}", report="{str(report_name)}", entities_count="{str(count)}"'
                    )
                    # yield
                    results_dict = {
                        "tenant_id": self.tenant_id,
                        "action": "success",
                        "results": "outliers tracker job successfully executed but there were no entities to be tracked at this time",
                        "run_time": round((time.time() - start), 3),
                        "entities_count": str(count),
                        "upstream_search_query": search_query,
                    }
                    yield {"_time": time.time(), "_raw": results_dict}

                # Call the component register (success summary)
                trackme_register_tenant_object_summary(
                    session_key,
                    self._metadata.searchinfo.splunkd_uri,
                    self.tenant_id,
                    f"splk-{self.component}",
                    f"trackme_{self.component}_outliers_mlmonitor_tracker_tenant_{str(self.tenant_id)}",
                    "success",
                    time.time(),
                    round(time.time() - start, 3),
                    "The report was executed successfully",
                    "-24h",
                    "now",
                )

            except Exception as e:
                # register the failure, log and re-raise so the job is marked
                # failed by Splunk
                trackme_register_tenant_object_summary(
                    session_key,
                    self._metadata.searchinfo.splunkd_uri,
                    self.tenant_id,
                    f"splk-{self.component}",
                    f"trackme_{self.component}_outliers_mlmonitor_tracker_tenant_{str(self.tenant_id)}",
                    "failure",
                    time.time(),
                    round(time.time() - start, 3),
                    str(e),
                    "-24h",
                    "now",
                )
                msg = f'tenant_id="{self.tenant_id}", permanent search failure, exception="{str(e)}", search_query="{search_query}", search_kwargs="{kwargs_oneshot}"'
                logging.error(msg)
                raise Exception(msg)
|
|
|
|
|
|
# Splunk SDK protocol entry point: parse the SPL invocation from argv/stdin and
# execute the command, emitting results to stdout
dispatch(SplkOutliersTracker, sys.argv, sys.stdin, sys.stdout, __name__)
|