You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Splunk_Deploiement/apps/trackme/bin/trackmesplkadaptivedelay.py

1559 lines
79 KiB

#!/usr/bin/env python
# coding=utf-8
__author__ = "TrackMe Limited"
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
__credits__ = "TrackMe Limited, U.K."
__license__ = "TrackMe Limited, all rights reserved"
__version__ = "0.1.0"
__maintainer__ = "TrackMe Limited, U.K."
__email__ = "support@trackme-solutions.com"
__status__ = "PRODUCTION"
# Standard library imports
import os
import sys
import time
import json
# Logging imports
import logging
from logging.handlers import RotatingFileHandler
# Networking imports
import requests
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# splunk home: required, raises KeyError early if SPLUNK_HOME is not set
splunkhome = os.environ["SPLUNK_HOME"]

# set logging: dedicated rotating log file (~10MB, one backup) under var/log/splunk
filehandler = RotatingFileHandler(
    "%s/var/log/splunk/trackme_adaptive_delay.log" % splunkhome,
    mode="a",
    maxBytes=10000000,
    backupCount=1,
)
formatter = logging.Formatter(
    "%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s"
)
# log timestamps in UTC rather than local time
logging.Formatter.converter = time.gmtime
filehandler.setFormatter(formatter)
log = logging.getLogger()  # root logger - Good to get it only once.
for hdlr in log.handlers[:]:  # remove the existing file handlers
    if isinstance(hdlr, logging.FileHandler):
        log.removeHandler(hdlr)
log.addHandler(filehandler)  # set the new handler
# set the log level to INFO, DEBUG as the default is ERROR
# (may be raised/lowered later from the TrackMe configuration)
log.setLevel(logging.INFO)

# append current directory so the app-local libraries below can be imported
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# import libs
import import_declare_test
# import Splunk libs
from splunklib.searchcommands import (
dispatch,
GeneratingCommand,
Configuration,
Option,
validators,
)
# import trackme libs
from trackme_libs import (
trackme_reqinfo,
trackme_register_tenant_object_summary,
trackme_vtenant_account,
trackme_idx_for_tenant,
run_splunk_search,
trackme_handler_events,
)
# import trackme libs utils
from trackme_libs_utils import remove_leading_spaces
# import trackme libs croniter
from trackme_libs_croniter import cron_to_seconds
@Configuration(distributed=False)
class AdaptiveDelay(GeneratingCommand):
# Search command options. All numeric options are received as strings and
# validated with a regex; they are converted with int()/float() where used.
tenant_id = Option(
    doc="""
    **Syntax:** **tenant_id=****
    **Description:** The tenant identifier.""",
    require=True,
    default=None,
)

component = Option(
    doc="""
    **Syntax:** **component=****
    **Description:** Specify the TrackMe component.""",
    require=True,
    default=None,
    validate=validators.Match("component", r"^(dsm|dhm)$"),
)

min_delay_sec = Option(
    doc="""
    **Syntax:** **min_delay_sec=<integer>****
    **Description:** The minimal delay value for a given entity to be taken into account, expressed in seconds.""",
    require=False,
    default="3600",
    # bugfix: the validator name must match the option name (it is used in
    # validation error messages); it previously read "min_hours_delay"
    validate=validators.Match("min_delay_sec", r"^\d*$"),
)

max_auto_delay_sec = Option(
    doc="""
    **Syntax:** **max_auto_delay_sec=<integer>****
    **Description:** The maximal delay value that the adaptive backend can set, if the automated delay calculation goes beyond it, this value will be used instead to set the delay, expressed in seconds.""",
    require=False,
    default="604800",
    validate=validators.Match("max_auto_delay_sec", r"^\d*$"),
)

max_changes_past_7days = Option(
    doc="""
    **Syntax:** **max_changes_past_7days=<integer>****
    **Description:** The maximal number of changes that can be performed in a 7 days time frame, once reached we will not update this entity again until the counter is reset.""",
    require=False,
    default="10",
    validate=validators.Match("max_changes_past_7days", r"^\d*$"),
)

min_historical_metrics_days = Option(
    doc="""
    **Syntax:** **min_historical_metrics_days=<integer>****
    **Description:** The minimal number of accumulated days of metrics before we start updating the delay threshold, expressed in days.""",
    require=False,
    default="7",
    validate=validators.Match("min_historical_metrics_days", r"^\d*$"),
)

max_sla_percentage = Option(
    doc="""
    **Syntax:** **max_sla_percentage=<integer>****
    **Description:** The maximum SLA percentage for entities, if the SLA percentage is greater than this value, the delay threshold will not be updated to avoid updating highly stable entities.""",
    require=False,
    default="90",
    validate=validators.Match("max_sla_percentage", r"^\d*$"),
)

earliest_time_mstats = Option(
    doc="""
    **Syntax:** **earliest_time_mstats=****
    **Description:** The earliest time to use for the mstats search.""",
    require=False,
    default="-30d",
)

max_runtime = Option(
    doc="""
    **Syntax:** **max_runtime=****
    **Description:** The max runtime for the job in seconds, defaults to 15 minutes less 120 seconds of margin.""",
    require=False,
    default="900",
    validate=validators.Match("max_runtime", r"^\d*$"),
)

review_period_no_days = Option(
    doc="""
    **Syntax:** **review_period_no_days=****
    **Description:** The relative time period for review. When entities were updated, TrackMe will review over time the behaviour and eventually adapt the threshold to take into accoount new patterns, expressed in number of days, valid options: 7, 15, 30""",
    require=False,
    default="30",
    validate=validators.Match("review_period_no_days", r"^(7|15|30)$"),
)
def get_collection_records(self, collection, min_delay_sec):
    """
    Queries and processes records from a collection based on specific criteria.

    Pages through the KVstore collection and retains entities that are
    enabled, red with a breached delay threshold, within the configured
    delay boundaries, not using an overridden lagging class, and allowed
    for adaptive delay.

    :param collection: The KVstore collection object to query.
    :param min_delay_sec: Minimum delay seconds for processing.
    :return: Tuple (collection_records, collection_records_dict, count_to_process_list).
    """
    collection_records = []
    collection_records_dict = {}
    count_to_process_list = []

    end = False
    skip_tracker = 0
    while not end:
        # paginate in pages of 5000 records — assumes the query returns
        # at most 5000 records per call (TODO confirm server-side limit)
        process_collection_records = collection.data.query(skip=skip_tracker)
        if process_collection_records:
            for item in process_collection_records:
                # data_last_lag_seen may be missing, None or an empty
                # string in the KVstore: default to 0 instead of raising
                try:
                    current_delay = float(item.get("data_last_lag_seen", 0))
                except (TypeError, ValueError):
                    current_delay = 0.0

                data_override_lagging_class = item.get(
                    "data_override_lagging_class", "true"
                )
                allow_adaptive_delay = item.get("allow_adaptive_delay", "true")

                # anomaly_reason can be a pipe separated string, a list,
                # or absent; normalize to a list so the membership test
                # below cannot raise a TypeError on None
                anomaly_reason = item.get("anomaly_reason")
                if isinstance(anomaly_reason, str):
                    anomaly_reason = anomaly_reason.split("|")
                elif anomaly_reason is None:
                    anomaly_reason = []

                if (
                    item.get("monitored_state") == "enabled"
                    and item.get("object_state") == "red"
                    and "delay_threshold_breached" in anomaly_reason
                    and current_delay > float(min_delay_sec)
                    and current_delay <= float(self.max_auto_delay_sec)
                    and data_override_lagging_class != "true"
                    and allow_adaptive_delay == "true"
                ):
                    collection_records.append(item)
                    collection_records_dict[item.get("_key")] = {
                        "object": item.get("object"),
                        "current_max_lag_event_sec": item.get(
                            "data_max_delay_allowed"
                        ),
                    }
                    count_to_process_list.append(item.get("object"))
            skip_tracker += 5000
        else:
            end = True

    return collection_records, collection_records_dict, count_to_process_list
def get_recent_activity_item(
    self,
    item,
    collection_records_dict,
    count_to_process_list,
    collection_records,
    object_processed_past30days_threshold_increased,
    object_processed_past30days_threshold_decreased,
    object_processed_past15days_threshold_increased,
    object_processed_past15days_threshold_decreased,
    object_processed_past7days_threshold_increased,
    object_processed_past7days_threshold_decreased,
    object_processed_past24hours_threshold_increased,
    object_processed_past24hours_threshold_decreased,
    object_processed_past4hours_threshold_increased,
    object_processed_past4hours_threshold_decreased,
    object_processed_past4hours,
    object_processed_past24hours,
    object_processed_past7days,
    object_processed_past15days,
    object_processed_past30days,
):
    """
    Processes a single item from recent activity results and updates the
    provided lists and dictionaries in place.

    :param item: A dictionary representing a single record from recent activity results.
    :param collection_records_dict: Dictionary to store collection records.
    :param count_to_process_list: List to store counts of objects to process.
    :param collection_records: List to store collection records.
    :param object_processed_past30days_threshold_increased: List to store objects processed in the past 30 days with increased threshold.
    :param object_processed_past30days_threshold_decreased: List to store objects processed in the past 30 days with decreased threshold.
    :param object_processed_past15days_threshold_increased: List to store objects processed in the past 15 days with increased threshold.
    :param object_processed_past15days_threshold_decreased: List to store objects processed in the past 15 days with decreased threshold.
    :param object_processed_past7days_threshold_increased: List to store objects processed in the past 7 days with increased threshold.
    :param object_processed_past7days_threshold_decreased: List to store objects processed in the past 7 days with decreased threshold.
    :param object_processed_past24hours_threshold_increased: List to store objects processed in the past 24 hours with increased threshold.
    :param object_processed_past24hours_threshold_decreased: List to store objects processed in the past 24 hours with decreased threshold.
    :param object_processed_past4hours_threshold_increased: List to store objects processed in the past 4 hours with increased threshold.
    :param object_processed_past4hours_threshold_decreased: List to store objects processed in the past 4 hours with decreased threshold.
    :param object_processed_past4hours: List to store objects processed in the past 4 hours.
    :param object_processed_past24hours: List to store objects processed in the past 24 hours.
    :param object_processed_past7days: List to store objects processed in the past 7 days.
    :param object_processed_past15days: List to store objects processed in the past 15 days.
    :param object_processed_past30days: List to store objects processed in the past 30 days.
    :return: object_summary_dict, a per-object summary of counters and flags.
    """
    object_summary_dict = {}

    # Extracting information from the item
    object_key = item.get("key")
    object_value = item.get("object")
    current_max_lag_event_sec = item.get("current_max_lag_event_sec")
    object_summary_dict["current_max_lag_event_sec"] = current_max_lag_event_sec

    # Processing past 7 days changes
    past7days_changes_count = int(item.get("past7days_changes_count", 0))
    object_summary_dict["past7days_changes_count"] = past7days_changes_count

    # Process past 15 days changes
    # bugfix: this value was previously stored under the 7 days key
    past15days_changes_count = int(item.get("past15days_changes_count", 0))
    object_summary_dict["past15days_changes_count"] = past15days_changes_count

    # Process past 30 days changes
    # bugfix: this value was previously stored under the 7 days key
    past30days_changes_count = int(item.get("past30days_changes_count", 0))
    object_summary_dict["past30days_changes_count"] = past30days_changes_count

    # Processing status flags (all flags are "true"/"false" strings
    # produced by the SPL search, not Python booleans)
    processed_past30days = item.get("processed_past30days")
    object_summary_dict["processed_past30days"] = processed_past30days
    processed_past15days = item.get("processed_past15days")
    object_summary_dict["processed_past15days"] = processed_past15days
    processed_past7days = item.get("processed_past7days")
    object_summary_dict["processed_past7days"] = processed_past7days
    processed_past24hours = item.get("processed_past24hours")
    object_summary_dict["processed_past24hours"] = processed_past24hours
    processed_past4hours = item.get("processed_past4hours")
    object_summary_dict["processed_past4hours"] = processed_past4hours

    # Processing threshold changes
    increased_past30days = item.get("increased_past30days")
    object_summary_dict["increased_past30days"] = increased_past30days
    decreased_past30days = item.get("decreased_past30days")
    object_summary_dict["decreased_past30days"] = decreased_past30days
    increased_past15days = item.get("increased_past15days")
    object_summary_dict["increased_past15days"] = increased_past15days
    decreased_past15days = item.get("decreased_past15days")
    object_summary_dict["decreased_past15days"] = decreased_past15days
    increased_past7days = item.get("increased_past7days")
    object_summary_dict["increased_past7days"] = increased_past7days
    decreased_past7days = item.get("decreased_past7days")
    object_summary_dict["decreased_past7days"] = decreased_past7days
    increased_past24hours = item.get("increased_past24hours")
    object_summary_dict["increased_past24hours"] = increased_past24hours
    decreased_past24hours = item.get("decreased_past24hours")
    object_summary_dict["decreased_past24hours"] = decreased_past24hours
    increased_past4hours = item.get("increased_past4hours")
    object_summary_dict["increased_past4hours"] = increased_past4hours
    decreased_past4hours = item.get("decreased_past4hours")
    object_summary_dict["decreased_past4hours"] = decreased_past4hours

    # Adding to lists based on conditions
    if increased_past30days == "true":
        object_processed_past30days_threshold_increased.append(object_value)
    if decreased_past30days == "true":
        object_processed_past30days_threshold_decreased.append(object_value)
    if increased_past15days == "true":
        object_processed_past15days_threshold_increased.append(object_value)
    if decreased_past15days == "true":
        object_processed_past15days_threshold_decreased.append(object_value)
    if increased_past7days == "true":
        object_processed_past7days_threshold_increased.append(object_value)
    if decreased_past7days == "true":
        object_processed_past7days_threshold_decreased.append(object_value)
    if increased_past24hours == "true":
        object_processed_past24hours_threshold_increased.append(object_value)
    if decreased_past24hours == "true":
        object_processed_past24hours_threshold_decreased.append(object_value)
    if increased_past4hours == "true":
        object_processed_past4hours_threshold_increased.append(object_value)
    if decreased_past4hours == "true":
        object_processed_past4hours_threshold_decreased.append(object_value)
    if processed_past4hours == "true":
        object_processed_past4hours.append(object_value)
    if processed_past24hours == "true":
        object_processed_past24hours.append(object_value)

    # Objects recently inspected but not in the current breach list are
    # added back so they can be reviewed again if conditions are met
    if processed_past30days == "true":
        object_processed_past30days.append(object_value)
        if object_key not in collection_records_dict:
            logging.info(
                f'tenant_id="{self.tenant_id}", object="{object_value}", recent activity inspection, this object was inspected in the past 30 days, adding for this object for review if conditions are met.'
            )
            collection_records_dict[object_key] = {
                "object": object_value,
                "current_max_lag_event_sec": current_max_lag_event_sec,
            }
            count_to_process_list.append(object_value)
            collection_records.append(item)
    if processed_past15days == "true":
        object_processed_past15days.append(object_value)
        if object_key not in collection_records_dict:
            logging.info(
                f'tenant_id="{self.tenant_id}", object="{object_value}", recent activity inspection, this object was inspected in the past 15 days, adding for this object for review if conditions are met.'
            )
            collection_records_dict[object_key] = {
                "object": object_value,
                "current_max_lag_event_sec": current_max_lag_event_sec,
            }
            count_to_process_list.append(object_value)
            collection_records.append(item)
    if processed_past7days == "true":
        object_processed_past7days.append(object_value)
        if object_key not in collection_records_dict:
            logging.info(
                f'tenant_id="{self.tenant_id}", object="{object_value}", recent activity inspection, this object was inspected in the past 7 days, adding for this object for review if conditions are met.'
            )
            collection_records_dict[object_key] = {
                "object": object_value,
                "current_max_lag_event_sec": current_max_lag_event_sec,
            }
            count_to_process_list.append(object_value)
            collection_records.append(item)

    return object_summary_dict
def get_recent_activity_search(self, tenant_audit_idx):
    """
    Generates a search string to get the recent activity for a given tenant.

    The SPL inspects the tenant audit index for successful "automated
    adaptive delay update" events, enriches them from the tenant KVstore
    collection, then derives per-object flags describing when the
    threshold was last changed (past 4h/24h/7d/15d/30d) and the direction
    of the change (increase/decrease).

    :param tenant_audit_idx: The name of the tenant audit index.
    :return: A string containing the search query.
    """
    # NOTE(review): the lookup below targets trackme_dsm_tenant_<id> even
    # when self.component == "dhm" — confirm this is intended.
    # NOTE(review): past15days_changes_count / past30days_changes_count
    # appear in the final `fields` list but only past7days_changes_count
    # is computed by the `stats` — downstream consumers default them to 0.
    search_string = f"""\
search index={tenant_audit_idx} tenant_id={self.tenant_id} object_category=* "automated adaptive delay update" action="success"
| table _time, tenant_id, object_category, object, action, change_type, comment
| sort - 0 _time | trackmeprettyjson fields=comment
| spath input=comment
| rename results.adaptive_delay as adaptive_delay, results.current_max_lag_event_sec as updated_max_lag_event_sec
``` define the direction of the threshold change ```
| eval direction=case(
adaptive_delay>updated_max_lag_event_sec, "increase",
adaptive_delay<updated_max_lag_event_sec, "decrease",
1=1, "undetermined"
)
``` get latest ```
| stats count as past7days_changes_count, max(_time) as mtime, latest(adaptive_delay) as adaptive_delay, latest(updated_max_lag_event_sec) as updated_max_lag_event_sec, latest(direction) as direction, latest(comment) as comment by tenant_id, object_category, object
``` lookup KV ```
| lookup trackme_dsm_tenant_{self.tenant_id} object OUTPUT _key as key, monitored_state, allow_adaptive_delay, data_max_delay_allowed as current_max_lag_event_sec
``` filter out ```
| where monitored_state="enabled" AND allow_adaptive_delay="true" AND isnotnull(key) AND isnotnull(current_max_lag_event_sec)
``` calculated time of inspection ```
| eval time_since_inspection=now()-mtime
``` define if processed within the past 30 days, 15 days, 7 days, past 24 hours, past 4 hours ```
| eval processed_past30days=if(time_since_inspection<2592000, "true", "false")
| eval processed_past15days=if(time_since_inspection<1296000, "true", "false")
| eval processed_past7days=if(time_since_inspection<604800, "true", "false")
| eval processed_past24hours=if(time_since_inspection<86400, "true", "false")
| eval processed_past4hours=if(time_since_inspection<14400, "true", "false")
``` define if threshold was increased/decreased in the past 30 days ```
| eval increased_past30days=if(processed_past30days=="true" AND direction=="increase", "true", "false")
| eval decreased_past30days=if(processed_past30days=="true" AND direction=="decrease", "true", "false")
``` define if threshold was increased/decreased in the past 15 days ```
| eval increased_past15days=if(processed_past15days=="true" AND direction=="increase", "true", "false")
| eval decreased_past15days=if(processed_past15days=="true" AND direction=="decrease", "true", "false")
``` define if threshold was increased/decreased in the past 7 days ```
| eval increased_past7days=if(processed_past7days=="true" AND direction=="increase", "true", "false")
| eval decreased_past7days=if(processed_past7days=="true" AND direction=="decrease", "true", "false")
``` define if threshold was increased/decreased in the past 24 hours ```
| eval increased_past24hours=if(processed_past24hours=="true" AND direction=="increase", "true", "false")
| eval decreased_past24hours=if(processed_past24hours=="true" AND direction=="decrease", "true", "false")
``` define if threshold was increased/decreased in the past 4 hours ```
| eval increased_past4hours=if(processed_past4hours=="true" AND direction=="increase", "true", "false")
| eval decreased_past4hours=if(processed_past4hours=="true" AND direction=="decrease", "true", "false")
``` final ```
| dedup object
| fields key, object, current_max_lag_event_sec, updated_max_lag_event_sec, adaptive_delay, mtime, time_since_inspection, past30days_changes_count, processed_past30days, past15days_changes_count, processed_past15days, past7days_changes_count, processed_past7days, processed_past24hours, processed_past4hours, increased_past30days, decreased_past30days, increased_past15days, decreased_past15days, increased_past7days, decreased_past7days, increased_past24hours, decreased_past24hours, increased_past4hours, decreased_past4hours, direction, comment
"""
    return search_string
def get_ml_condidence_search(self, object_name):
    """
    Generates a search string to get the confidence level for a given object.

    Confidence is "low" when fewer than min_historical_metrics_days days of
    lag metrics have been accumulated for the object, "normal" otherwise.

    NOTE(review): the method name carries a typo ("condidence" instead of
    "confidence"); it is kept as-is since callers elsewhere reference it.

    :param object_name: The name of the object for which to generate the search string.
    :return: A string containing the search query.
    """
    search_string = f"""\
| mstats latest(trackme.splk.feeds.lag_event_sec) as lag_event_sec where `trackme_metrics_idx({self.tenant_id})` tenant_id="{self.tenant_id}" object_category="splk-{self.component}" object="{object_name}" by object span=1d
| stats min(_time) as first_time by object
| eval metrics_duration=now()-first_time
| eval confidence=if(metrics_duration<({self.min_historical_metrics_days}*86400), "low", "normal")
| eval metrics_duration=tostring(metrics_duration, "duration")
| head 1
"""
    return search_string
def get_sla_percentage_search(self, object_id):
    """
    Build the SPL search returning the SLA percentage for a given object.

    :param object_id: The id of the object for which to generate the search string.
    :return: A string containing the search query.
    """
    # delegate entirely to the packaged macro which computes the SLA
    # percentage for the entity key
    return f"""\
| `trackme_get_sla_pct_metrics_per_entity_key({self.tenant_id},splk-{self.component},{object_id})`
"""
def get_mstats_ml_advanced_search(self, object_name):
    """
    Generates an advanced mstats machine learning search string for a given object.

    The SPL runs three DensityFunction fits over the object's lag metric
    (30 days, 7 days, 24 hours), joins the resulting upper bounds, averages
    them when all three are numeric (falling back to the 7 days value
    otherwise), and derives an adaptive delay rounded up to the next hour.

    NOTE(review): `fit DensityFunction` is an MLTK command — presumably the
    Splunk Machine Learning Toolkit must be installed; confirm deployment
    prerequisite.

    :param object_name: The name of the object for which to generate the search string.
    :return: A string containing the advanced mstats ML search query.
    """
    search_string = f"""\
| mstats latest(trackme.splk.feeds.lag_event_sec) as lag_event_sec where `trackme_metrics_idx({self.tenant_id})` tenant_id="{self.tenant_id}" object_category="splk-{self.component}" object="{object_name}" earliest="-30d" latest="now" by object span=5m
``` ML calculations for this object ```
| fit DensityFunction lag_event_sec lower_threshold=0.005 upper_threshold=0.005 by object
| rex field=BoundaryRanges "(-Infinity:(?<LowerBound>[\\d|\\.]*))|((?<UpperBound>[\\d|\\.]*):Infinity)"
| foreach LowerBound UpperBound [ eval <<FIELD>> = if(isnum('<<FIELD>>'), '<<FIELD>>', 0) ]
| fields _time object lag_event_sec LowerBound UpperBound
``` retain the UpperBound and perform additional calculations ```
| stats first(UpperBound) as UpperBound, perc95(lag_event_sec) as perc95_lag_event_sec, min(lag_event_sec) as min_lag_event_sec, max(lag_event_sec) as max_lag_event_sec, stdev(lag_event_sec) as stdev_lag_event_sec by object | eval UpperBound=round(UpperBound, 0)
| foreach *_lag_event_sec [ eval <<FIELD>> = round('<<FIELD>>', 0) ]
``` round by the hour, and go at the next hour range ```
| eval adaptive_delay = (round(UpperBound/3600, 0) * 3600) + 3600, adaptive_delay_duration = tostring(adaptive_delay, "duration")
``` rename ```
| rename LowerBound as LowerBound_30d, UpperBound as UpperBound_30d, perc95_lag_event_sec as perc95_lag_event_sec_30d, min_lag_event_sec as min_lag_event_sec_30d, max_lag_event_sec as max_lag_event_sec_30d, stdev_lag_event_sec as stdev_lag_event_sec_30d, adaptive_delay as adaptive_delay_30d, adaptive_delay_duration as adaptive_delay_duration_30d
| join type=outer object [
| mstats latest(trackme.splk.feeds.lag_event_sec) as lag_event_sec where `trackme_metrics_idx({self.tenant_id})` tenant_id="{self.tenant_id}" object_category="splk-{self.component}" object="{object_name}" earliest="-7d" latest="now" by object span=5m
``` ML calculations for this object ```
| fit DensityFunction lag_event_sec lower_threshold=0.005 upper_threshold=0.005 by object
| rex field=BoundaryRanges "(-Infinity:(?<LowerBound>[\\d|\\.]*))|((?<UpperBound>[\\d|\\.]*):Infinity)"
| foreach LowerBound UpperBound [ eval <<FIELD>> = if(isnum('<<FIELD>>'), '<<FIELD>>', 0) ]
| fields _time object lag_event_sec LowerBound UpperBound
``` retain the UpperBound and perform additional calculations ```
| stats first(UpperBound) as UpperBound, perc95(lag_event_sec) as perc95_lag_event_sec, min(lag_event_sec) as min_lag_event_sec, max(lag_event_sec) as max_lag_event_sec, stdev(lag_event_sec) as stdev_lag_event_sec by object | eval UpperBound=round(UpperBound, 0)
| foreach *_lag_event_sec [ eval <<FIELD>> = round('<<FIELD>>', 0) ]
``` round by the hour, and go at the next hour range ```
| eval adaptive_delay = (round(UpperBound/3600, 0) * 3600) + 3600, adaptive_delay_duration = tostring(adaptive_delay, "duration")
``` rename ```
| rename LowerBound as LowerBound_7d, UpperBound as UpperBound_7d, perc95_lag_event_sec as perc95_lag_event_sec_7d, min_lag_event_sec as min_lag_event_sec_7d, max_lag_event_sec as max_lag_event_sec_7d, stdev_lag_event_sec as stdev_lag_event_sec_7d, adaptive_delay as adaptive_delay_7d, adaptive_delay_duration as adaptive_delay_duration_7d
]
| join type=outer object [
| mstats latest(trackme.splk.feeds.lag_event_sec) as lag_event_sec where `trackme_metrics_idx({self.tenant_id})` tenant_id="{self.tenant_id}" object_category="splk-{self.component}" object="{object_name}" earliest="-24h" latest="now" by object span=5m
``` ML calculations for this object ```
| fit DensityFunction lag_event_sec lower_threshold=0.005 upper_threshold=0.005 by object
| rex field=BoundaryRanges "(-Infinity:(?<LowerBound>[\\d|\\.]*))|((?<UpperBound>[\\d|\\.]*):Infinity)"
| foreach LowerBound UpperBound [ eval <<FIELD>> = if(isnum('<<FIELD>>'), '<<FIELD>>', 0) ]
| fields _time object lag_event_sec LowerBound UpperBound
``` retain the UpperBound and perform additional calculations ```
| stats first(UpperBound) as UpperBound, perc95(lag_event_sec) as perc95_lag_event_sec, min(lag_event_sec) as min_lag_event_sec, max(lag_event_sec) as max_lag_event_sec, stdev(lag_event_sec) as stdev_lag_event_sec by object | eval UpperBound=round(UpperBound, 0)
| foreach *_lag_event_sec [ eval <<FIELD>> = round('<<FIELD>>', 0) ]
``` round by the hour, and go at the next hour range ```
| eval adaptive_delay = (round(UpperBound/3600, 0) * 3600) + 3600, adaptive_delay_duration = tostring(adaptive_delay, "duration")
``` rename ```
| rename LowerBound as LowerBound_24h, UpperBound as UpperBound_24h, perc95_lag_event_sec as perc95_lag_event_sec_24h, min_lag_event_sec as min_lag_event_sec_24h, max_lag_event_sec as max_lag_event_sec_24h, stdev_lag_event_sec as stdev_lag_event_sec_24h, adaptive_delay as adaptive_delay_24h, adaptive_delay_duration as adaptive_delay_duration_24h
]
``` aggregate the UpperBound, if for any reason one the UpperBound is not returned as expected, we will use the 7d value ```
| eval UpperBound=case(
isnum(UpperBound_30d) AND isnum(UpperBound_7d) AND isnum(UpperBound_24h), round((UpperBound_30d+UpperBound_7d+UpperBound_24h)/3, 2),
1=1, UpperBound_7d
)
| eval adaptive_delay = (round(UpperBound/3600, 0) * 3600) + 3600, adaptive_delay_duration = tostring(adaptive_delay, "duration")
``` only consider results with a valid numerical adaptive_delay ```
| where isnum(adaptive_delay)
"""
    return search_string
def get_mstats_ml_simple_search(self, object_name):
    """
    Generates a simple mstats machine learning search string for a given object.

    Single DensityFunction fit over the object's lag metric (default time
    range, driven by the dispatching caller), deriving an adaptive delay
    rounded up to the next hour from the fitted upper bound.

    :param object_name: The name of the object for which to generate the search string.
    :return: A string containing the simple mstats ML search query.
    """
    search_string = f"""\
| mstats latest(trackme.splk.feeds.lag_event_sec) as lag_event_sec where `trackme_metrics_idx({self.tenant_id})` tenant_id="{self.tenant_id}" object_category="splk-{self.component}" object="{object_name}" by object span=5m
``` ML calculations for this object ```
| fit DensityFunction lag_event_sec lower_threshold=0.005 upper_threshold=0.005 by object
| rex field=BoundaryRanges "(-Infinity:(?<LowerBound>[\\d|\\.]*))|((?<UpperBound>[\\d|\\.]*):Infinity)"
| foreach LowerBound UpperBound [ eval <<FIELD>> = if(isnum('<<FIELD>>'), '<<FIELD>>', 0) ]
| fields _time object lag_event_sec LowerBound UpperBound
``` retain the UpperBound and perform additional calculations ```
| stats first(UpperBound) as UpperBound, perc95(lag_event_sec) as perc95_lag_event_sec, min(lag_event_sec) as min_lag_event_sec, max(lag_event_sec) as max_lag_event_sec, stdev(lag_event_sec) as stdev_lag_event_sec by object | eval UpperBound=round(UpperBound, 0)
| foreach *_lag_event_sec [ eval <<FIELD>> = round('<<FIELD>>', 0) ]
``` round by the hour, and go at the next hour range ```
| eval adaptive_delay = (round(UpperBound/3600, 0) * 3600) + 3600, adaptive_delay_duration = tostring(adaptive_delay, "duration")
``` only consider results with a valid numerical adaptive_delay ```
| where isnum(adaptive_delay)
"""
    return search_string
def construct_url_for_lag_policy_update(self):
    """
    Constructs the URL for updating the lag policy based on the component.

    :return: URL string.
    :raises ValueError: when the component is neither dsm nor dhm.
    """
    # dispatch table: component -> endpoint leaf name
    endpoints = {
        "dsm": "ds_update_lag_policy",
        "dhm": "dh_update_lag_policy",
    }
    endpoint = endpoints.get(self.component)
    if endpoint is None:
        # Handle other components or raise an error
        raise ValueError("Invalid component type")
    base_uri = self._metadata.searchinfo.splunkd_uri
    return f"{base_uri}/services/trackme/v2/splk_{self.component}/write/{endpoint}"
def run_post_api_call(
    self,
    entity_dict,
    header,
    max_auto_delay_sec,
    count_updated,
    count_failed,
    count_updated_list,
    count_updated_msg_list,
    count_failed_list,
    count_processed,
    count_processed_list,
    count_processed_msg_list,
    count_failed_msg_list,
):
    """
    Runs a POST API call to update the lag policy for a given entity.

    The counters/lists are passed in and returned so the caller can
    accumulate them across entities. A "processed" entity is one whose
    policy is already at the target value or was updated successfully;
    "updated" only counts actual successful updates.

    :param entity_dict: Dictionary containing the entity details.
    :param header: Authorization header for the request.
    :param max_auto_delay_sec: Maximum allowed delay for checks.
    :param count_updated: Counter for successful updates.
    :param count_failed: Counter for failed updates.
    :param count_updated_list: List to keep track of updated entities.
    :param count_updated_msg_list: List to keep track of updated messages.
    :param count_failed_list: List to keep track of failed entities.
    :param count_processed: Counter for processed entities.
    :param count_processed_list: List to keep track of processed entities.
    :param count_processed_msg_list: List to keep track of processed messages.
    :param count_failed_msg_list: List to keep track of failure messages.
    :return: Updated counters and lists.
    """
    entity_name = entity_dict.get("object")
    # assumes entity_dict always carries numeric-convertible values for
    # these two keys — TODO confirm upstream guarantees
    adaptive_delay = float(entity_dict.get("adaptive_delay"))
    current_max_lag_event_sec = float(entity_dict.get("current_max_lag_event_sec"))

    # Proceed only if adaptive_delay != current_max_lag_event_sec
    if adaptive_delay == current_max_lag_event_sec:
        log_msg = f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{entity_name}", adaptive_delay="{adaptive_delay}", current_max_lag_event_sec="{current_max_lag_event_sec}", no need to update the lag policy as it already defined to the target value'
        logging.info(log_msg)
        # already at target: counted as processed, not updated
        count_processed += 1
        count_processed_list.append(entity_name)
        count_processed_msg_list.append(log_msg)
        return (
            count_updated,
            count_failed,
            count_updated_list,
            count_updated_msg_list,
            count_failed_list,
            count_processed,
            count_processed_list,
            count_processed_msg_list,
            count_failed_msg_list,
        )

    # If the adaptive_delay is bigger than the max_auto_delay_sec, the adaptive_delay will be set to the max_auto_delay_sec
    elif adaptive_delay > int(max_auto_delay_sec):
        log_msg = f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{entity_name}", adaptive_delay="{adaptive_delay}", current_max_lag_event_sec="{current_max_lag_event_sec}", max_auto_delay_sec={max_auto_delay_sec} has been reached while performing the delay calculation, will be applying the max allowed delay instead.'
        logging.info(log_msg)
        adaptive_delay = int(max_auto_delay_sec)

    # Construct URL based on component
    url = self.construct_url_for_lag_policy_update()

    # Prepare data for the POST request; the update comment is persisted
    # to the audit trail and later parsed by get_recent_activity_search
    update_comment_json = {
        "context": "automated adaptive delay update",
        "results": entity_dict,
    }
    data = {
        "tenant_id": self.tenant_id,
        "object_list": entity_name,
        "data_max_delay_allowed": adaptive_delay,
        "update_comment": json.dumps(update_comment_json, indent=0),
    }

    # Make the POST request and handle response
    # verify=False: call targets the local splunkd with its self-signed cert
    try:
        response = requests.post(
            url,
            headers={
                "Authorization": header,
                "Content-Type": "application/json",
            },
            data=json.dumps(data),
            verify=False,
            timeout=600,
        )
        if response.status_code not in (200, 201, 204):
            log_msg = f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{entity_name}", updating lag policy has failed, response.status_code="{response.status_code}", response.text="{response.text}"'
            logging.error(log_msg)
            count_failed += 1
            count_failed_list.append(entity_name)
            count_failed_msg_list.append(log_msg)
        else:
            log_msg = f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{entity_name}", lag policy updated successfully, adaptive_delay="{adaptive_delay}", response.status_code="{response.status_code}"'
            logging.info(log_msg)
            # successful update counts as both processed and updated
            count_processed += 1
            count_processed_list.append(entity_name)
            count_processed_msg_list.append(log_msg)
            count_updated += 1
            count_updated_list.append(entity_name)
            count_updated_msg_list.append(log_msg)
    except Exception as e:
        log_msg = f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{entity_name}", updating lag policy has failed, exception="{str(e)}"'
        logging.error(log_msg)
        count_failed += 1
        count_failed_list.append(entity_name)
        count_failed_msg_list.append(log_msg)

    return (
        count_updated,
        count_failed,
        count_updated_list,
        count_updated_msg_list,
        count_failed_list,
        count_processed,
        count_processed_list,
        count_processed_msg_list,
        count_failed_msg_list,
    )
def call_component_register(self, action_result, action_message, run_time):
    """
    Register the outcome of this tracker execution in the tenant objects summary.

    :param action_result: outcome of the action, success or failure
    :param action_message: message describing the action outcome
    :param run_time: execution duration of the action, in seconds
    :return: None
    """
    # resolve the search context and identifiers once, for readability
    search_info = self._metadata.searchinfo
    component_ref = f"splk-{self.component}"
    tracker_name = (
        f"trackme_{self.component}_adaptive_delay_tracker_tenant_{self.tenant_id}"
    )
    trackme_register_tenant_object_summary(
        search_info.session_key,
        search_info.splunkd_uri,
        self.tenant_id,
        component_ref,
        tracker_name,
        action_result,
        time.time(),
        run_time,
        action_message,
        "-5m",
        "now",
    )
def generate(self, **kwargs):
    """
    Generating command entry point: orchestrate the adaptive delay workflow for the tenant.

    Workflow as implemented below:
    1. Load request info, the tracker saved search cron schedule and the vtenant
       settings; exit early (yielding a single record) if adaptive delay is
       disabled for this tenant.
    2. Retrieve the candidate entities from the tenant component KVstore collection.
    3. Inspect recent audit activity to classify entities already processed in the
       past 4h / 24h / 7d / 15d / 30d windows.
    4. For each candidate entity: run an ML confidence check and an SLA percentage
       check, then, when conditions are met, run the ML mstats search to compute
       the adaptive delay value. A time budget derived from max_runtime and the
       cron schedule bounds the loop.
    5. POST each computed result to the lag policy update endpoint, register the
       execution summary, yield a single summary record and emit handler events.

    :param kwargs: unused, required by the GeneratingCommand interface
    :yield: a single dict with _time and _raw summarizing the execution
    """
    if self:  # NOTE(review): always truthy for a command instance; kept as-is
        # Track execution times
        execution_times = []
        average_execution_time = 0
        # performance counter
        start = time.time()
        # Get request info and set logging level
        reqinfo = trackme_reqinfo(
            self._metadata.searchinfo.session_key,
            self._metadata.searchinfo.splunkd_uri,
        )
        log.setLevel(reqinfo["logging_level"])
        logging.info(
            f'tenant_id="{self.tenant_id}", component="splk-{self.component}", trackmesplkadaptivedelay is starting now.'
        )
        # max runtime
        max_runtime = int(self.max_runtime)
        # Retrieve the search cron schedule
        savedsearch_name = f"trackme_{self.component}_adaptive_delay_tracker_tenant_{self.tenant_id}"
        savedsearch = self.service.saved_searches[savedsearch_name]
        savedsearch_cron_schedule = savedsearch.content["cron_schedule"]
        # get the cron_exec_sequence_sec
        try:
            cron_exec_sequence_sec = int(cron_to_seconds(savedsearch_cron_schedule))
        except Exception as e:
            logging.error(
                f'tenant_id="{self.tenant_id}", component="splk-{self.component}", failed to convert the cron schedule to seconds, error="{str(e)}"'
            )
            # fall back to the configured max_runtime if the cron cannot be parsed
            cron_exec_sequence_sec = max_runtime
        # the max_runtime cannot be bigger than the cron_exec_sequence_sec
        if max_runtime > cron_exec_sequence_sec:
            max_runtime = cron_exec_sequence_sec
        logging.info(
            f'max_runtime="{max_runtime}", savedsearch_name="{savedsearch_name}", savedsearch_cron_schedule="{savedsearch_cron_schedule}", cron_exec_sequence_sec="{cron_exec_sequence_sec}"'
        )
        # Get tenant indexes
        tenant_indexes = trackme_idx_for_tenant(
            self._metadata.searchinfo.session_key,
            self._metadata.searchinfo.splunkd_uri,
            self.tenant_id,
        )
        tenant_audit_idx = tenant_indexes.get("trackme_audit_idx", "trackme_audit")
        # Get the session key
        session_key = self._metadata.searchinfo.session_key
        # Get the vtenant account
        vtenant_account = trackme_vtenant_account(
            session_key, self._metadata.searchinfo.splunkd_uri, self.tenant_id
        )
        # adaptive delay defaults to enabled (1) when absent from the vtenant account
        adaptive_delay_enabled = int(vtenant_account.get("adaptive_delay", 1))
        # if adaptive_delay_enabled is not enabled, we will skip the execution, log the information and exit immediately
        if adaptive_delay_enabled == 0:
            logging.info(
                f'tenant_id="{self.tenant_id}", component="splk-{self.component}", adaptive_delay is disabled for this tenant, skipping execution'
            )
            yield_results = {
                "action": "success",
                "tenant_id": self.tenant_id,
                "component": self.component,
                "msg": "adaptive_delay is disabled for this tenant, skipping execution",
            }
            yield {
                "_time": time.time(),
                "_raw": yield_results,
            }
            return
        # Add the session_key to the reqinfo
        reqinfo["session_key"] = session_key
        # Splunk header
        header = f"Splunk {session_key}"
        # Data collection
        collection_name = f"kv_trackme_{self.component}_tenant_{self.tenant_id}"
        collection = self.service.kvstore[collection_name]
        # get all records
        (
            collection_records,
            collection_records_dict,
            count_to_process_list,
        ) = self.get_collection_records(collection, self.min_delay_sec)
        logging.debug(
            f'retrieving records to be processed, collection_records="{json.dumps(collection_records, indent=2)}"'
        )
        """
        Logic description:
        - First, we select entities that are monitored, red, have breached the delay threshold and have a current delay bigger than the min_delay_sec
        - We then exclude entities that have data_override_lagging_class=true and data_allow_adaptive_delay=true
        - We then exclude entities that have been processed in the past 24 hours
        - We process to a ML confidence inspection, if the confidence is low, we will skip the entity, if the entity has been processed in the past 24 hours, we will skip the entity
        - If the entity has been processed in the past 7 days, we will run the ML search with a restricted time range of 7 days to review if the behaviour has changed
        """
        # A list to store object processed in the past 30 days prior to -1d
        object_processed_past30days = []
        # A list to store object processed in the past 15 days prior to -1d
        object_processed_past15days = []
        # A list to store object processed in the past 7 days prior to -1d
        object_processed_past7days = []
        # A list to store object processed in the past 24 hours
        object_processed_past24hours = []
        # A list to store object processed in the past 4 hours
        object_processed_past4hours = []
        # A list to store object processed in the past 15 days and where the threshold was increased
        object_processed_past15days_threshold_increased = []
        # A list to store object processed in the past 15 days and where the threshold was decreased
        object_processed_past15days_threshold_decreased = []
        # A list to store object processed in the past 30 days and where the threshold was increased
        object_processed_past30days_threshold_increased = []
        # A list to store object processed in the past 30 days and where the threshold was decreased
        object_processed_past30days_threshold_decreased = []
        # A list to store object processed in the past 7 days and where the threshold was increased
        object_processed_past7days_threshold_increased = []
        # A list to store object processed in the past 7 days and where the threshold was decreased
        object_processed_past7days_threshold_decreased = []
        # A list to store object processed in the past 24 hours and where the threshold was increased
        object_processed_past24hours_threshold_increased = []
        # A list to store object processed in the past 24 hours and where the threshold was decreased
        object_processed_past24hours_threshold_decreased = []
        # A list to store object processed in the past 4 hours and where the threshold was increased
        object_processed_past4hours_threshold_increased = []
        # A list to store object processed in the past 4 hours and where the threshold was decreased
        object_processed_past4hours_threshold_decreased = []
        # An integer counter of the number of changes performed during the past 7 days for each object
        # NOTE(review): this counter is never incremented anywhere in this method, so the
        # max_changes_past_7days comparisons below can only trigger when the max is 0 — confirm
        # whether get_recent_activity_item was meant to maintain it (an int cannot be
        # mutated through the call) or whether it should come from object_summary_dict.
        past7days_changes_count = 0
        # An object summary dict
        object_summary_dict = {}
        #
        # 0. Check in our logs, identify entities we have recently managed to verify if the status has changed and should be updated
        # - entities processed in the last past 24 hours are added to a special list for further exclusion
        # - entities processed in the last past 7 days are added to a special list for review processing
        # - entities processed in the last past 15 days are added to a special list for review processing
        # - entities processed in the last past 30 days are added to a special list for review processing
        #
        # kwargs
        kwargs_recent_activity = {
            "earliest_time": "-31d",
            "latest_time": "now",
            "output_mode": "json",
            "count": 0,
        }
        # conditionally add the earliest_time
        # (narrow the audit lookback to the configured review period, +1 day of margin)
        if int(self.review_period_no_days) == 7:
            kwargs_recent_activity["earliest_time"] = "-8d"
        elif int(self.review_period_no_days) == 15:
            kwargs_recent_activity["earliest_time"] = "-16d"
        elif int(self.review_period_no_days) == 30:
            kwargs_recent_activity["earliest_time"] = "-31d"
        recent_activity_search = remove_leading_spaces(
            self.get_recent_activity_search(tenant_audit_idx)
        )
        # log
        logging.info(
            f'tenant_id="{self.tenant_id}", component="splk-{self.component}", recent activity inspection, recent_activity_search="{recent_activity_search}", kwargs="{json.dumps(kwargs_recent_activity, indent=0)}"'
        )
        try:
            search_start = time.time()
            reader = run_splunk_search(
                self.service,
                recent_activity_search,
                kwargs_recent_activity,
                24,
                5,
            )
            # each audit result classifies one object into the per-window lists above
            for item in reader:
                if isinstance(item, dict):
                    object_summary_dict = self.get_recent_activity_item(
                        item,
                        collection_records_dict,
                        count_to_process_list,
                        collection_records,
                        object_processed_past30days_threshold_increased,
                        object_processed_past30days_threshold_decreased,
                        object_processed_past15days_threshold_increased,
                        object_processed_past15days_threshold_decreased,
                        object_processed_past7days_threshold_increased,
                        object_processed_past7days_threshold_decreased,
                        object_processed_past24hours_threshold_increased,
                        object_processed_past24hours_threshold_decreased,
                        object_processed_past4hours_threshold_increased,
                        object_processed_past4hours_threshold_decreased,
                        object_processed_past4hours,
                        object_processed_past24hours,
                        object_processed_past7days,
                        object_processed_past15days,
                        object_processed_past30days,
                    )
                    logging.info(
                        f'tenant_id="{self.tenant_id}", component="splk-{self.component}", Processing results from recent_activity_results, result="{json.dumps(item, indent=2)}"'
                    )
        except Exception as e:
            logging.error(f"Failed to execute Splunk search with error: {str(e)}")
            msg = f'tenant_id="{self.tenant_id}", component="splk-{self.component}", recent activity search failed with exception="{str(e)}", run_time="{time.time() - search_start}"'
            logging.error(msg)
            raise Exception(msg)
        #
        # 1. If we have entities to manage, loop though entities, run an mstats search and use ML dentisy function to define the adaptive_delay value
        # Store results in a dict which will be used to update the KVstore calling the API endpoint
        #
        # if we have entities to be managed
        # create a results dict
        adaptive_delay_results = {}
        # debug
        logging.debug(
            f'tenant_id="{self.tenant_id}", component="splk-{self.component}", before processing, our collection_records_dict is: {json.dumps(collection_records_dict, indent=2)}'
        )
        # counters for pending, we will store and render these for additional context
        count_pending = 0
        count_pending_list = []
        count_pending_msg_list = []
        # Initialize sum of execution times and count of iterations
        total_execution_time = 0
        iteration_count = 0
        # Other initializations
        max_runtime = int(self.max_runtime)
        if len(collection_records) != 0:
            for object_id in collection_records_dict:
                # iteration start
                iteration_start_time = time.time()
                object_name = collection_records_dict.get(object_id).get("object")
                # log
                logging.info(
                    f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", object_summary_dict="{json.dumps(object_summary_dict, indent=0)}", adaptive delay inspection, we will proceed to ML calculations for this entity'
                )
                # get current_max_lag_event_sec
                object_current_max_lag_event_sec = collection_records_dict.get(
                    object_id
                ).get("current_max_lag_event_sec")
                #
                # Confidence: Verify if we have enough historical metrics to proceed
                #
                # boolean to defined if ML confidence check is passed
                ml_confidence_check_passed = False
                # initiate to low
                ml_confidence = "low"
                # initiate to unknown
                ml_metrics_duration = "unknown"
                # If the entity has been processed in the past 7 days, ML confidence check is passed already
                if object_name in object_processed_past7days:
                    ml_confidence_check_passed = True
                    ml_confidence = "normal"
                    ml_confidence_reason = f"ML confidence is passed as this entity was processed in the past 7 days."
                    logging.info(
                        f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", ML confidence inspection, ml_confidence="{ml_confidence}", ml_confidence_reason="{ml_confidence_reason}"'
                    )
                # verify ML confidence
                else:
                    # kwargs
                    kwargs_confidence = {
                        "earliest_time": "-30d",
                        "latest_time": "now",
                        "output_mode": "json",
                        "count": 0,
                    }
                    ml_confidence_search = remove_leading_spaces(
                        self.get_ml_condidence_search(object_name)
                    )
                    logging.info(
                        f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", ML confidence inspection, ml_confidence_search="{ml_confidence_search}"'
                    )
                    try:
                        search_start = time.time()
                        reader = run_splunk_search(
                            self.service,
                            ml_confidence_search,
                            kwargs_confidence,
                            24,
                            5,
                        )
                        for item in reader:
                            if isinstance(item, dict):
                                logging.info(
                                    f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", Processing results from ML confidence inspection, result="{json.dumps(item, indent=2)}"'
                                )
                                # log
                                logging.info(
                                    f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", ML confidence inspection results, ml_confidence="{item.get("confidence")}", metrics_duration="{item.get("metrics_duration")}"'
                                )
                                ml_confidence = item.get("confidence", "low")
                                ml_metrics_duration = item.get(
                                    "metrics_duration", "unknown"
                                )
                    except Exception as e:
                        msg = f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", ML confidence inspection search failed with exception="{str(e)}", run_time="{time.time() - search_start}"'
                        logging.error(msg)
                        raise Exception(msg)
                    # set the ml_confidence_reason
                    if ml_confidence == "low":
                        ml_confidence_check_passed = False
                        ml_confidence_reason = f"ML has insufficient historical metrics to proceed (metrics_duration={ml_metrics_duration}, required={self.min_historical_metrics_days} days)"
                        logging.info(
                            f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", ML confidence inspection, ml_confidence="{ml_confidence}", ml_confidence_reason="{ml_confidence_reason}", we will wait for confidence to be normal before proceeding this entity'
                        )
                        # guard against double-counting the same object as pending
                        if object_name not in count_pending_list:
                            count_pending += 1
                            count_pending_list.append(object_name)
                            count_pending_msg_list.append(
                                f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", ML confidence inspection, ml_confidence="{ml_confidence}", ml_confidence_reason="{ml_confidence_reason}", we will wait for confidence to be normal before proceeding this entity'
                            )
                    elif ml_confidence == "normal":
                        ml_confidence_check_passed = True
                        ml_confidence_reason = f'ML has sufficient historical metrics to proceed (metrics_duration="{ml_metrics_duration}", required="{self.min_historical_metrics_days}" days)'
                        logging.info(
                            f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", ML confidence inspection, ml_confidence="{ml_confidence}", ml_confidence_reason="{ml_confidence_reason}", we will proceed this entity'
                        )
                #
                # SLA percentage: Verify if the SLA percentage is lower than the max_sla_percentage, if not we will not proceed with this entity
                #
                # boolean to defined if SLA percentage check is passed, default is True unless proven otherwise
                sla_percentage_check_passed = True
                sla_percentage = 0
                # kwargs
                kwargs_sla_percentage = {
                    "earliest_time": "-90d",
                    "latest_time": "now",
                    "output_mode": "json",
                    "count": 0,
                }
                sla_percentage_search = remove_leading_spaces(
                    self.get_sla_percentage_search(object_id)
                )
                logging.info(
                    f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", SLA percentage inspection, sla_percentage_search="{sla_percentage_search}"'
                )
                try:
                    search_start = time.time()
                    reader = run_splunk_search(
                        self.service,
                        sla_percentage_search,
                        kwargs_sla_percentage,
                        24,
                        5,
                    )
                    for item in reader:
                        if isinstance(item, dict):
                            logging.info(
                                f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", Processing results from SLA percentage inspection, result="{json.dumps(item, indent=2)}"'
                            )
                            sla_percentage = float(item.get("percent_sla", 100))
                            # log
                            # NOTE(review): the search result carries "percent_sla" (read above);
                            # "sla_percentage" is not a result field, so this logs None — confirm
                            # the intended field name for this log statement.
                            logging.info(
                                f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", SLA percentage inspection results, sla_percentage="{item.get("sla_percentage")}"'
                            )
                except Exception as e:
                    msg = f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", SLA percentage inspection search failed with exception="{str(e)}", run_time="{time.time() - search_start}"'
                    logging.error(msg)
                    raise Exception(msg)
                # set the sla_percentage_check_passed and reason
                if sla_percentage > int(self.max_sla_percentage):
                    sla_percentage_check_passed = False
                    sla_percentage_reason = f"SLA percentage {sla_percentage} is greater than the max_sla_percentage {self.max_sla_percentage}, we will not proceed with this entity"
                    logging.info(
                        f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", SLA percentage inspection, sla_percentage="{sla_percentage}", sla_percentage_reason="{sla_percentage_reason}", we will not proceed with this entity'
                    )
                    if object_name not in count_pending_list:
                        count_pending += 1
                        count_pending_list.append(object_name)
                        count_pending_msg_list.append(
                            f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", SLA percentage inspection, sla_percentage="{sla_percentage}", sla_percentage_reason="{sla_percentage_reason}", we will not proceed with this entity'
                        )
                else:
                    sla_percentage_check_passed = True
                    sla_percentage_reason = f"SLA percentage {sla_percentage} is lower than the max_sla_percentage {self.max_sla_percentage}, we will proceed with this entity"
                    logging.info(
                        f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", SLA percentage inspection, sla_percentage="{sla_percentage}", sla_percentage_reason="{sla_percentage_reason}", we will proceed this entity'
                    )
                #
                # Proceed ML investigations
                #
                # boolean proceed investigations (True by default)
                proceed_investigations = True
                # If updated in the past 4 hours, we will wait whatever the direction of the change and other conditions
                # NOTE(review): unlike the pending branches above, this one does not check
                # count_pending_list membership first, so an object already counted pending
                # would be counted twice — confirm whether the guard was intended here too.
                if object_name in object_processed_past4hours:
                    proceed_investigations = False
                    count_pending += 1
                    count_pending_list.append(object_name)
                    count_pending_msg_list.append(
                        f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", object_summary_dict="{json.dumps(object_summary_dict, indent=0)}", This entity has been updated in the past 4 hours, we will wait before processing this entity again.'
                    )
                    logging.info(
                        f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", object_summary_dict="{json.dumps(object_summary_dict, indent=0)}", This entity has been updated in the past 4 hours, we will wait before processing this entity again.'
                    )
                # else if updated in the past 24 hours and the threshold was increased in the past 24 hours, we will review
                elif (
                    object_name in object_processed_past24hours_threshold_increased
                    and past7days_changes_count < int(self.max_changes_past_7days)
                ):
                    proceed_investigations = True
                    logging.info(
                        f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", object_summary_dict="{json.dumps(object_summary_dict, indent=0)}", This entity has been updated in the past 24 hours and the threshold was increased, we will review this entity again.'
                    )
                # else if we have reached the number of changes allowed for a 7 days time frame, we will wait
                # NOTE(review): strict ">" here vs "<" above leaves the equality case
                # falling through to the else branch — confirm intended boundary behavior.
                elif past7days_changes_count > int(self.max_changes_past_7days):
                    proceed_investigations = False
                    count_pending += 1
                    count_pending_list.append(object_name)
                    count_pending_msg_list.append(
                        f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", object_summary_dict="{json.dumps(object_summary_dict, indent=0)}", This entity has reached the number of changes allowed for a 7 days time frame, we will wait before processing this entity again.'
                    )
                    logging.info(
                        f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", object_summary_dict="{json.dumps(object_summary_dict, indent=0)}", This entity has reached the number of changes allowed for a 7 days time frame, we will wait before processing this entity again.'
                    )
                else:
                    # proceed if ml confidence check is passed
                    if (
                        ml_confidence_check_passed == True
                        and sla_percentage_check_passed == True
                    ):
                        proceed_investigations = True
                        logging.info(
                            f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", object_summary_dict="{json.dumps(object_summary_dict, indent=0)}", conditions are met for this entity to be processed.'
                        )
                    else:
                        proceed_investigations = False
                #
                # Proceed to ML investigations
                #
                if (
                    proceed_investigations
                    and ml_confidence_check_passed
                    and sla_percentage_check_passed
                ):
                    # kwargs
                    kwargs_ml_mstats = {
                        "earliest_time": self.earliest_time_mstats,
                        "latest_time": "now",
                        "output_mode": "json",
                        "count": 0,
                    }
                    # search the search string
                    # if object has been processed in the past 7 days, we will run a more complex adaptive logic
                    if object_name in object_processed_past7days:
                        ml_mstats_search = self.get_mstats_ml_advanced_search(
                            object_name
                        )
                    else:
                        ml_mstats_search = self.get_mstats_ml_simple_search(
                            object_name
                        )
                    # set a version of the search but remove carriage returns for logging purposes
                    ml_mstats_search_for_logging = remove_leading_spaces(
                        ml_mstats_search
                    )
                    # remove any carriage returns
                    ml_mstats_search_for_logging = (
                        ml_mstats_search_for_logging.replace("\n", " ")
                    )
                    logging.info(
                        f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", running mstats search_string="{remove_leading_spaces(ml_mstats_search)}", kwargs_ml_mstats="{json.dumps(kwargs_ml_mstats, indent=2)}")'
                    )
                    try:
                        search_start = time.time()
                        reader = run_splunk_search(
                            self.service,
                            remove_leading_spaces(ml_mstats_search),
                            kwargs_ml_mstats,
                            24,
                            5,
                        )
                        for item in reader:
                            if isinstance(item, dict):
                                logging.info(
                                    f'tenant_id="{self.tenant_id}", component="splk-{self.component}", Processing results from ML mstats, result="{json.dumps(item, indent=2)}"'
                                )
                                # add per entity results in the dict with the key object
                                # add all fields returned in item to adaptive_delay_results[object_id]
                                # init
                                adaptive_delay_results[object_id] = {}
                                for k, v in item.items():
                                    adaptive_delay_results[object_id][k] = v
                                # add current_max_lag_event_sec which is not part of the search results
                                adaptive_delay_results[object_id][
                                    "current_max_lag_event_sec"
                                ] = object_current_max_lag_event_sec
                                # add ml_mstats_search_for_logging and kwargs_ml_mstats
                                adaptive_delay_results[object_id][
                                    "search_string"
                                ] = ml_mstats_search_for_logging
                                adaptive_delay_results[object_id][
                                    "search_kwargs"
                                ] = kwargs_ml_mstats
                                logging.info(
                                    f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", results="{json.dumps(item, indent=2)}"'
                                )
                    except Exception as e:
                        logging.error(
                            f"Failed to execute Splunk search with error: {str(e)}"
                        )
                        msg = f'tenant_id="{self.tenant_id}", component="splk-{self.component}", ML mstats search failed with exception="{str(e)}", run_time="{time.time() - search_start}"'
                        logging.error(msg)
                        raise Exception(msg)
                # Calculate the execution time for this iteration
                iteration_end_time = time.time()
                execution_time = iteration_end_time - iteration_start_time
                # Update total execution time and iteration count
                total_execution_time += execution_time
                iteration_count += 1
                # Calculate average execution time
                if iteration_count > 0:
                    average_execution_time = total_execution_time / iteration_count
                else:
                    average_execution_time = 0
                # Check if there is enough time left to continue
                # (120 seconds of safety margin on top of the projected next iteration)
                current_time = time.time()
                elapsed_time = current_time - start
                if elapsed_time + average_execution_time + 120 >= max_runtime:
                    logging.info(
                        f'tenant_id="{self.tenant_id}", component="splk-{self.component}", max_runtime="{max_runtime}" is about to be reached, current_runtime="{elapsed_time}", job will be terminated now'
                    )
                    break
            #
            # 2. Loop through the list adaptive_records_results_list and call the API endpoint to update the lag policy
            #
            logging.debug(
                f"adaptive_delay_results={json.dumps(adaptive_delay_results, indent=2)}"
            )
            count_updated = 0
            count_updated_list = []
            count_updated_msg_list = []
            count_processed = 0
            count_processed_list = []
            count_processed_msg_list = []
            count_failed = 0
            count_failed_list = []
            count_failed_msg_list = []
            # run_post_api_call threads the counters through each call and returns them updated
            for object_id in adaptive_delay_results:
                entity_dict = adaptive_delay_results.get(object_id)
                (
                    count_updated,
                    count_failed,
                    count_updated_list,
                    count_updated_msg_list,
                    count_failed_list,
                    count_processed,
                    count_processed_list,
                    count_processed_msg_list,
                    count_failed_msg_list,
                ) = self.run_post_api_call(
                    entity_dict,
                    header,
                    self.max_auto_delay_sec,
                    count_updated,
                    count_failed,
                    count_updated_list,
                    count_updated_msg_list,
                    count_failed_list,
                    count_processed,
                    count_processed_list,
                    count_processed_msg_list,
                    count_failed_msg_list,
                )
            # action results
            if count_failed == 0:
                action = "success"
            else:
                action = "failure"
            # set run_time
            run_time = round(time.time() - start, 3)
            # call the component register
            if action == "success":
                self.call_component_register(
                    "success", "The report was executed successfully", run_time
                )
            else:
                self.call_component_register(
                    "failure", json.dumps(count_failed_msg_list, indent=0), run_time
                )
            yield_results = {
                "action": action,
                "tenant_id": self.tenant_id,
                "component": self.component,
                "count_to_process": len(collection_records),
                "count_to_process_list": count_to_process_list,
                "count_processed": count_processed,
                "count_processed_list": count_processed_list,
                "count_processed_msg_list": count_processed_msg_list,
                "count_failed": count_failed,
                "count_failed_list": count_failed_list,
                "count_failed_msg_list": count_failed_msg_list,
                "count_updated": count_updated,
                "count_updated_list": count_updated_list,
                "count_updated_msg_list": count_updated_msg_list,
                "count_pending": count_pending,
                "count_pending_list": count_pending_list,
                "count_pending_msg_list": count_pending_msg_list,
                "count_processed_past30days": object_processed_past30days,
                "count_processed_past15days": object_processed_past15days,
                "count_processed_past7days": object_processed_past7days,
                "count_processed_past24hours": object_processed_past24hours,
            }
            yield {
                "_time": time.time(),
                "_raw": yield_results,
            }
            # handler event
            handler_events_records = []
            for object_name in count_processed_list:
                # Find the object_id by looking up in collection_records_dict
                object_id = None
                for key, value in collection_records_dict.items():
                    if value.get("object") == object_name:
                        object_id = key
                        break
                handler_events_records.append(
                    {
                        "object": object_name,
                        "object_id": object_id,
                        "object_category": f"splk-{self.component}",
                        "handler": f"trackme_{self.component}_adaptive_delay_tracker_tenant_{self.tenant_id}",
                        "handler_message": "Entity was processed by the adaptive delay tracker.",
                        "handler_troubleshoot_search": f'index=_internal (sourcetype=trackme:custom_commands:trackmesplkadaptivedelay) tenant_id={self.tenant_id} object="{object_name}"',
                        "handler_time": time.time(),
                    }
                )
            # notification event
            # (best effort: a notification failure is logged but never fails the command)
            try:
                trackme_handler_events(
                    session_key=self._metadata.searchinfo.session_key,
                    splunkd_uri=self._metadata.searchinfo.splunkd_uri,
                    tenant_id=self.tenant_id,
                    sourcetype="trackme:handler",
                    source=f"trackme:handler:{self.tenant_id}",
                    handler_events=handler_events_records,
                )
            except Exception as e:
                logging.error(
                    f'tenant_id="{self.tenant_id}", component=f"splk-{self.component}", could not send notification event, exception="{e}"'
                )
        else:
            # no candidate entities: register success and yield an informational record
            # set run_time
            run_time = round(time.time() - start, 3)
            # Call the component register
            self.call_component_register(
                "success", "The report was executed successfully", run_time
            )
            yield_results = {
                "action": "success",
                "tenant_id": self.tenant_id,
                "component": self.component,
                "count_to_process": len(collection_records),
                "msg": "no entities to manage currently",
            }
            yield {
                "_time": time.time(),
                "_raw": yield_results,
            }
        logging.info(
            f'tenant_id="{self.tenant_id}", component="splk-{self.component}", trackmesplkadaptivedelay has terminated, run_time={run_time}, results="{json.dumps(yield_results, indent=2)}"'
        )
# Splunk SDK entry point: register and execute the AdaptiveDelay generating command
dispatch(AdaptiveDelay, sys.argv, sys.stdin, sys.stdout, __name__)