You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1559 lines
79 KiB
1559 lines
79 KiB
#!/usr/bin/env python
|
|
# coding=utf-8
|
|
|
|
__author__ = "TrackMe Limited"
|
|
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
|
|
__credits__ = "TrackMe Limited, U.K."
|
|
__license__ = "TrackMe Limited, all rights reserved"
|
|
__version__ = "0.1.0"
|
|
__maintainer__ = "TrackMe Limited, U.K."
|
|
__email__ = "support@trackme-solutions.com"
|
|
__status__ = "PRODUCTION"
|
|
|
|
# Standard library imports
|
|
import os
|
|
import sys
|
|
import time
|
|
import json
|
|
|
|
# Logging imports
|
|
import logging
|
|
from logging.handlers import RotatingFileHandler
|
|
|
|
# Networking imports
|
|
import requests
|
|
import urllib3
|
|
|
|
# Suppress TLS certificate warnings: REST calls may target splunkd over
# https with self-signed certificates.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Splunk installation directory (raises KeyError if unset, which is
# expected since this command only runs inside a Splunk instance)
splunkhome = os.environ["SPLUNK_HOME"]

# Configure a dedicated rotating log file for this command: timestamps are
# rendered in UTC, and any pre-existing file handler on the root logger is
# removed first so records are not duplicated.
logging.Formatter.converter = time.gmtime
formatter = logging.Formatter(
    "%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s"
)
filehandler = RotatingFileHandler(
    "%s/var/log/splunk/trackme_adaptive_delay.log" % splunkhome,
    mode="a",
    maxBytes=10000000,
    backupCount=1,
)
filehandler.setFormatter(formatter)

log = logging.getLogger()  # root logger, fetched only once
for existing_handler in list(log.handlers):
    if isinstance(existing_handler, logging.FileHandler):
        log.removeHandler(existing_handler)
log.addHandler(filehandler)
# INFO by default (set to DEBUG for troubleshooting; root default is ERROR)
log.setLevel(logging.INFO)

# Make the script directory importable (import_declare_test, trackme libs)
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
|
|
|
# import libs
|
|
import import_declare_test
|
|
|
|
# import Splunk libs
|
|
from splunklib.searchcommands import (
|
|
dispatch,
|
|
GeneratingCommand,
|
|
Configuration,
|
|
Option,
|
|
validators,
|
|
)
|
|
|
|
# import trackme libs
|
|
from trackme_libs import (
|
|
trackme_reqinfo,
|
|
trackme_register_tenant_object_summary,
|
|
trackme_vtenant_account,
|
|
trackme_idx_for_tenant,
|
|
run_splunk_search,
|
|
trackme_handler_events,
|
|
)
|
|
|
|
# import trackme libs utils
|
|
from trackme_libs_utils import remove_leading_spaces
|
|
|
|
# import trackme libs croniter
|
|
from trackme_libs_croniter import cron_to_seconds
|
|
|
|
|
|
@Configuration(distributed=False)
class AdaptiveDelay(GeneratingCommand):
    # Generating command implementing the TrackMe adaptive delay backend.
    # The options below select the tenant/component to process and bound
    # how aggressively delay thresholds may be reviewed and updated.

    # Tenant identifier, mandatory
    tenant_id = Option(
        doc="""
        **Syntax:** **tenant_id=****
        **Description:** The tenant identifier.""",
        require=True,
        default=None,
    )

    # TrackMe component, restricted to dsm (data sources) or dhm (data hosts)
    component = Option(
        doc="""
        **Syntax:** **component=****
        **Description:** Specify the TrackMe component.""",
        require=True,
        default=None,
        validate=validators.Match("component", r"^(dsm|dhm)$"),
    )

    # Entities with a last seen delay below this floor are ignored
    min_delay_sec = Option(
        doc="""
        **Syntax:** **min_delay_sec=<integer>****
        **Description:** The minimal delay value for a given entity to be taken into account, expressed in seconds.""",
        require=False,
        default="3600",
        # fixed: validator label previously said "min_hours_delay",
        # inconsistent with the option name used everywhere else
        validate=validators.Match("min_delay_sec", r"^\d*$"),
    )

    # Hard ceiling applied to any automatically computed delay threshold
    max_auto_delay_sec = Option(
        doc="""
        **Syntax:** **max_auto_delay_sec=<integer>****
        **Description:** The maximal delay value that the adaptive backend can set, if the automated delay calculation goes beyond it, this value will be used instead to set the delay, expressed in seconds.""",
        require=False,
        default="604800",
        validate=validators.Match("max_auto_delay_sec", r"^\d*$"),
    )

    # Rate limiting: maximum updates per entity over a rolling 7 days window
    max_changes_past_7days = Option(
        doc="""
        **Syntax:** **max_changes_past_7days=<integer>****
        **Description:** The maximal number of changes that can be performed in a 7 days time frame, once reached we will not update this entity again until the counter is reset.""",
        require=False,
        default="10",
        validate=validators.Match("max_changes_past_7days", r"^\d*$"),
    )

    # Minimum metrics history before the ML confidence is considered normal
    min_historical_metrics_days = Option(
        doc="""
        **Syntax:** **min_historical_metrics_days=<integer>****
        **Description:** The minimal number of accumulated days of metrics before we start updating the delay threshold, expressed in days.""",
        require=False,
        default="7",
        validate=validators.Match("min_historical_metrics_days", r"^\d*$"),
    )

    # Highly stable entities (SLA above this percentage) are left untouched
    max_sla_percentage = Option(
        doc="""
        **Syntax:** **max_sla_percentage=<integer>****
        **Description:** The maximum SLA percentage for entities, if the SLA percentage is greater than this value, the delay threshold will not be updated to avoid updating highly stable entities.""",
        require=False,
        default="90",
        validate=validators.Match("max_sla_percentage", r"^\d*$"),
    )

    # Earliest time boundary for the mstats searches
    earliest_time_mstats = Option(
        doc="""
        **Syntax:** **earliest_time_mstats=****
        **Description:** The earliest time to use for the mstats search.""",
        require=False,
        default="-30d",
    )

    # Job runtime budget in seconds
    # NOTE(review): doc says "15 minutes less 120 seconds of margin" but the
    # default is exactly 900s (15 min) — confirm intended default
    max_runtime = Option(
        doc="""
        **Syntax:** **max_runtime=****
        **Description:** The max runtime for the job in seconds, defaults to 15 minutes less 120 seconds of margin.""",
        require=False,
        default="900",
        validate=validators.Match("max_runtime", r"^\d*$"),
    )

    # Review period, restricted to 7, 15 or 30 days
    review_period_no_days = Option(
        doc="""
        **Syntax:** **review_period_no_days=****
        **Description:** The relative time period for review. When entities were updated, TrackMe will review over time the behaviour and eventually adapt the threshold to take into accoount new patterns, expressed in number of days, valid options: 7, 15, 30""",
        require=False,
        default="30",
        validate=validators.Match("review_period_no_days", r"^(7|15|30)$"),
    )
|
|
|
|
def get_collection_records(self, collection, min_delay_sec):
|
|
"""
|
|
Queries and processes records from a collection based on specific criteria.
|
|
|
|
:param collection: The collection object to query.
|
|
:param min_delay_sec: Minimum delay seconds for processing.
|
|
:return: Tuple containing collection records and a dictionary of records.
|
|
"""
|
|
collection_records = []
|
|
collection_records_dict = {}
|
|
count_to_process_list = []
|
|
|
|
end = False
|
|
skip_tracker = 0
|
|
while not end:
|
|
process_collection_records = collection.data.query(skip=skip_tracker)
|
|
if process_collection_records:
|
|
for item in process_collection_records:
|
|
current_delay = float(item.get("data_last_lag_seen", 0))
|
|
data_override_lagging_class = item.get(
|
|
"data_override_lagging_class", "true"
|
|
)
|
|
allow_adaptive_delay = item.get("allow_adaptive_delay", "true")
|
|
anomaly_reason = item.get("anomaly_reason")
|
|
# turn as a list from pipe seperated string, if not already a list
|
|
if isinstance(anomaly_reason, str):
|
|
anomaly_reason = anomaly_reason.split("|")
|
|
|
|
if (
|
|
item.get("monitored_state") == "enabled"
|
|
and item.get("object_state") == "red"
|
|
and "delay_threshold_breached" in anomaly_reason
|
|
and current_delay > float(min_delay_sec)
|
|
and current_delay <= float(self.max_auto_delay_sec)
|
|
and data_override_lagging_class != "true"
|
|
and allow_adaptive_delay == "true"
|
|
):
|
|
collection_records.append(item)
|
|
collection_records_dict[item.get("_key")] = {
|
|
"object": item.get("object"),
|
|
"current_max_lag_event_sec": item.get(
|
|
"data_max_delay_allowed"
|
|
),
|
|
}
|
|
count_to_process_list.append(item.get("object"))
|
|
skip_tracker += 5000
|
|
else:
|
|
end = True
|
|
|
|
return collection_records, collection_records_dict, count_to_process_list
|
|
|
|
def get_recent_activity_item(
|
|
self,
|
|
item,
|
|
collection_records_dict,
|
|
count_to_process_list,
|
|
collection_records,
|
|
object_processed_past30days_threshold_increased,
|
|
object_processed_past30days_threshold_decreased,
|
|
object_processed_past15days_threshold_increased,
|
|
object_processed_past15days_threshold_decreased,
|
|
object_processed_past7days_threshold_increased,
|
|
object_processed_past7days_threshold_decreased,
|
|
object_processed_past24hours_threshold_increased,
|
|
object_processed_past24hours_threshold_decreased,
|
|
object_processed_past4hours_threshold_increased,
|
|
object_processed_past4hours_threshold_decreased,
|
|
object_processed_past4hours,
|
|
object_processed_past24hours,
|
|
object_processed_past7days,
|
|
object_processed_past15days,
|
|
object_processed_past30days,
|
|
):
|
|
"""
|
|
Processes a single item from recent activity results and updates various lists and dictionaries accordingly.
|
|
|
|
:param self: The instance of the class where this function is used.
|
|
:param item: A dictionary representing a single record from recent activity results.
|
|
:param object_summary_dict: Dictionary to store summary of objects.
|
|
:param collection_records_dict: Dictionary to store collection records.
|
|
:param count_to_process_list: List to store counts of objects to process.
|
|
:param collection_records: List to store collection records.
|
|
:param object_processed_past30days_threshold_increased: List to store objects processed in the past 30 days with increased threshold.
|
|
:param object_processed_past30days_threshold_decreased: List to store objects processed in the past 30 days with decreased threshold.
|
|
:param object_processed_past15days_threshold_increased: List to store objects processed in the past 15 days with increased threshold.
|
|
:param object_processed_past15days_threshold_decreased: List to store objects processed in the past 15 days with decreased threshold.
|
|
:param object_processed_past7days_threshold_increased: List to store objects processed in the past 7 days with increased threshold.
|
|
:param object_processed_past7days_threshold_decreased: List to store objects processed in the past 7 days with decreased threshold.
|
|
:param object_processed_past24hours_threshold_increased: List to store objects processed in the past 24 hours with increased threshold.
|
|
:param object_processed_past24hours_threshold_decreased: List to store objects processed in the past 24 hours with decreased threshold.
|
|
:param object_processed_past4hours_threshold_increased: List to store objects processed in the past 4 hours with increased threshold.
|
|
:param object_processed_past4hours_threshold_decreased: List to store objects processed in the past 4 hours with decreased threshold.
|
|
:param object_processed_past4hours: List to store objects processed in the past 4 hours.
|
|
:param object_processed_past24hours: List to store objects processed in the past 24 hours.
|
|
:param object_processed_past7days: List to store objects processed in the past 7 days.
|
|
:param object_processed_past15days: List to store objects processed in the past 15 days.
|
|
:param object_processed_past30days: List to store objects processed in the past 30 days.
|
|
"""
|
|
|
|
object_summary_dict = {}
|
|
|
|
# Extracting information from the item
|
|
object_key = item.get("key")
|
|
object_value = item.get("object")
|
|
current_max_lag_event_sec = item.get("current_max_lag_event_sec")
|
|
object_summary_dict["current_max_lag_event_sec"] = current_max_lag_event_sec
|
|
|
|
# Processing past 7 days changes
|
|
past7days_changes_count = int(item.get("past7days_changes_count", 0))
|
|
object_summary_dict["past7days_changes_count"] = past7days_changes_count
|
|
|
|
# Process past 15 days changes
|
|
past15days_changes_count = int(item.get("past15days_changes_count", 0))
|
|
object_summary_dict["past7days_changes_count"] = past15days_changes_count
|
|
|
|
# Process past 30 days changes
|
|
past30days_changes_count = int(item.get("past30days_changes_count", 0))
|
|
object_summary_dict["past7days_changes_count"] = past30days_changes_count
|
|
|
|
# Processing status flags
|
|
processed_past30days = item.get("processed_past30days")
|
|
object_summary_dict["processed_past30days"] = processed_past30days
|
|
|
|
processed_past15days = item.get("processed_past15days")
|
|
object_summary_dict["processed_past15days"] = processed_past15days
|
|
|
|
processed_past7days = item.get("processed_past7days")
|
|
object_summary_dict["processed_past7days"] = processed_past7days
|
|
|
|
processed_past24hours = item.get("processed_past24hours")
|
|
object_summary_dict["processed_past24hours"] = processed_past24hours
|
|
|
|
processed_past4hours = item.get("processed_past4hours")
|
|
object_summary_dict["processed_past4hours"] = processed_past4hours
|
|
|
|
# Processing threshold changes
|
|
increased_past30days = item.get("increased_past30days")
|
|
object_summary_dict["increased_past30days"] = increased_past30days
|
|
decreased_past30days = item.get("decreased_past30days")
|
|
object_summary_dict["decreased_past30days"] = decreased_past30days
|
|
|
|
increased_past15days = item.get("increased_past15days")
|
|
object_summary_dict["increased_past15days"] = increased_past15days
|
|
decreased_past15days = item.get("decreased_past15days")
|
|
object_summary_dict["decreased_past15days"] = decreased_past15days
|
|
|
|
increased_past7days = item.get("increased_past7days")
|
|
object_summary_dict["increased_past7days"] = increased_past7days
|
|
decreased_past7days = item.get("decreased_past7days")
|
|
object_summary_dict["decreased_past7days"] = decreased_past7days
|
|
|
|
increased_past24hours = item.get("increased_past24hours")
|
|
object_summary_dict["increased_past24hours"] = increased_past24hours
|
|
decreased_past24hours = item.get("decreased_past24hours")
|
|
object_summary_dict["decreased_past24hours"] = decreased_past24hours
|
|
|
|
increased_past4hours = item.get("increased_past4hours")
|
|
object_summary_dict["increased_past4hours"] = increased_past4hours
|
|
decreased_past4hours = item.get("decreased_past4hours")
|
|
object_summary_dict["decreased_past4hours"] = decreased_past4hours
|
|
|
|
# Adding to lists based on conditions
|
|
|
|
if increased_past30days == "true":
|
|
object_processed_past30days_threshold_increased.append(object_value)
|
|
if decreased_past30days == "true":
|
|
object_processed_past30days_threshold_decreased.append(object_value)
|
|
|
|
if increased_past15days == "true":
|
|
object_processed_past15days_threshold_increased.append(object_value)
|
|
if decreased_past15days == "true":
|
|
object_processed_past15days_threshold_decreased.append(object_value)
|
|
|
|
if increased_past7days == "true":
|
|
object_processed_past7days_threshold_increased.append(object_value)
|
|
if decreased_past7days == "true":
|
|
object_processed_past7days_threshold_decreased.append(object_value)
|
|
|
|
if increased_past24hours == "true":
|
|
object_processed_past24hours_threshold_increased.append(object_value)
|
|
if decreased_past24hours == "true":
|
|
object_processed_past24hours_threshold_decreased.append(object_value)
|
|
|
|
if increased_past4hours == "true":
|
|
object_processed_past4hours_threshold_increased.append(object_value)
|
|
if decreased_past4hours == "true":
|
|
object_processed_past4hours_threshold_decreased.append(object_value)
|
|
if processed_past4hours == "true":
|
|
object_processed_past4hours.append(object_value)
|
|
|
|
if processed_past24hours == "true":
|
|
object_processed_past24hours.append(object_value)
|
|
|
|
if processed_past30days == "true":
|
|
object_processed_past30days.append(object_value)
|
|
if object_key not in collection_records_dict:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", object="{object_value}", recent activity inspection, this object was inspected in the past 30 days, adding for this object for review if conditions are met.'
|
|
)
|
|
collection_records_dict[object_key] = {
|
|
"object": object_value,
|
|
"current_max_lag_event_sec": current_max_lag_event_sec,
|
|
}
|
|
count_to_process_list.append(object_value)
|
|
collection_records.append(item)
|
|
|
|
if processed_past15days == "true":
|
|
object_processed_past15days.append(object_value)
|
|
if object_key not in collection_records_dict:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", object="{object_value}", recent activity inspection, this object was inspected in the past 15 days, adding for this object for review if conditions are met.'
|
|
)
|
|
collection_records_dict[object_key] = {
|
|
"object": object_value,
|
|
"current_max_lag_event_sec": current_max_lag_event_sec,
|
|
}
|
|
count_to_process_list.append(object_value)
|
|
collection_records.append(item)
|
|
|
|
if processed_past7days == "true":
|
|
object_processed_past7days.append(object_value)
|
|
if object_key not in collection_records_dict:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", object="{object_value}", recent activity inspection, this object was inspected in the past 7 days, adding for this object for review if conditions are met.'
|
|
)
|
|
collection_records_dict[object_key] = {
|
|
"object": object_value,
|
|
"current_max_lag_event_sec": current_max_lag_event_sec,
|
|
}
|
|
count_to_process_list.append(object_value)
|
|
collection_records.append(item)
|
|
|
|
return object_summary_dict
|
|
|
|
    def get_recent_activity_search(self, tenant_audit_idx):
        """
        Build the SPL search returning, per object, the recent successful
        "automated adaptive delay update" activity recorded in the tenant
        audit index.

        For each object, the search derives the direction of the last
        threshold change (increase/decrease), classifies whether the object
        was processed within the past 30/15/7 days, 24 hours and 4 hours,
        and joins the tenant KVstore collection to keep only monitored
        entities that allow adaptive delay.

        NOTE(review): the lookup targets trackme_dsm_tenant_<tenant_id> even
        when component=dhm — confirm this is intentional.
        NOTE(review): past30days_changes_count and past15days_changes_count
        are referenced in the final fields statement but never computed
        upstream; only past7days_changes_count is produced by the stats
        command (and it counts all changes within the search window, not
        only the past 7 days).

        :param tenant_audit_idx: The name of the tenant audit index.
        :return: A string containing the search query.
        """

        search_string = f"""\
search index={tenant_audit_idx} tenant_id={self.tenant_id} object_category=* "automated adaptive delay update" action="success"
| table _time, tenant_id, object_category, object, action, change_type, comment
| sort - 0 _time | trackmeprettyjson fields=comment
| spath input=comment
| rename results.adaptive_delay as adaptive_delay, results.current_max_lag_event_sec as updated_max_lag_event_sec

``` define the direction of the threshold change ```
| eval direction=case(
adaptive_delay>updated_max_lag_event_sec, "increase",
adaptive_delay<updated_max_lag_event_sec, "decrease",
1=1, "undetermined"
)

``` get latest ```
| stats count as past7days_changes_count, max(_time) as mtime, latest(adaptive_delay) as adaptive_delay, latest(updated_max_lag_event_sec) as updated_max_lag_event_sec, latest(direction) as direction, latest(comment) as comment by tenant_id, object_category, object

``` lookup KV ```
| lookup trackme_dsm_tenant_{self.tenant_id} object OUTPUT _key as key, monitored_state, allow_adaptive_delay, data_max_delay_allowed as current_max_lag_event_sec

``` filter out ```
| where monitored_state="enabled" AND allow_adaptive_delay="true" AND isnotnull(key) AND isnotnull(current_max_lag_event_sec)

``` calculated time of inspection ```
| eval time_since_inspection=now()-mtime

``` define if processed within the past 30 days, 15 days, 7 days, past 24 hours, past 4 hours ```
| eval processed_past30days=if(time_since_inspection<2592000, "true", "false")
| eval processed_past15days=if(time_since_inspection<1296000, "true", "false")
| eval processed_past7days=if(time_since_inspection<604800, "true", "false")
| eval processed_past24hours=if(time_since_inspection<86400, "true", "false")
| eval processed_past4hours=if(time_since_inspection<14400, "true", "false")

``` define if threshold was increased/decreased in the past 30 days ```
| eval increased_past30days=if(processed_past30days=="true" AND direction=="increase", "true", "false")
| eval decreased_past30days=if(processed_past30days=="true" AND direction=="decrease", "true", "false")

``` define if threshold was increased/decreased in the past 15 days ```
| eval increased_past15days=if(processed_past15days=="true" AND direction=="increase", "true", "false")
| eval decreased_past15days=if(processed_past15days=="true" AND direction=="decrease", "true", "false")

``` define if threshold was increased/decreased in the past 7 days ```
| eval increased_past7days=if(processed_past7days=="true" AND direction=="increase", "true", "false")
| eval decreased_past7days=if(processed_past7days=="true" AND direction=="decrease", "true", "false")

``` define if threshold was increased/decreased in the past 24 hours ```
| eval increased_past24hours=if(processed_past24hours=="true" AND direction=="increase", "true", "false")
| eval decreased_past24hours=if(processed_past24hours=="true" AND direction=="decrease", "true", "false")

``` define if threshold was increased/decreased in the past 4 hours ```
| eval increased_past4hours=if(processed_past4hours=="true" AND direction=="increase", "true", "false")
| eval decreased_past4hours=if(processed_past4hours=="true" AND direction=="decrease", "true", "false")

``` final ```
| dedup object
| fields key, object, current_max_lag_event_sec, updated_max_lag_event_sec, adaptive_delay, mtime, time_since_inspection, past30days_changes_count, processed_past30days, past15days_changes_count, processed_past15days, past7days_changes_count, processed_past7days, processed_past24hours, processed_past4hours, increased_past30days, decreased_past30days, increased_past15days, decreased_past15days, increased_past7days, decreased_past7days, increased_past24hours, decreased_past24hours, increased_past4hours, decreased_past4hours, direction, comment
"""

        return search_string
|
|
|
|
    def get_ml_condidence_search(self, object_name):
        """
        Build the SPL search returning the ML confidence level for a given
        object: confidence is "low" until at least
        min_historical_metrics_days days of lag metrics are available for
        the object, "normal" afterwards.

        NOTE(review): "condidence" is a typo for "confidence" in the method
        name; kept as-is since callers elsewhere may reference it.

        :param object_name: The name of the object for which to generate the search string.
        :return: A string containing the search query.
        """

        search_string = f"""\
| mstats latest(trackme.splk.feeds.lag_event_sec) as lag_event_sec where `trackme_metrics_idx({self.tenant_id})` tenant_id="{self.tenant_id}" object_category="splk-{self.component}" object="{object_name}" by object span=1d
| stats min(_time) as first_time by object
| eval metrics_duration=now()-first_time
| eval confidence=if(metrics_duration<({self.min_historical_metrics_days}*86400), "low", "normal")
| eval metrics_duration=tostring(metrics_duration, "duration")
| head 1
"""

        return search_string
|
|
|
|
    def get_sla_percentage_search(self, object_id):
        """
        Build the SPL search returning the SLA percentage for a given
        object, delegating the calculation to the
        trackme_get_sla_pct_metrics_per_entity_key macro for this tenant
        and component.

        :param object_id: The id (KVstore key) of the object for which to generate the search string.
        :return: A string containing the search query.
        """

        search_string = f"""\
| `trackme_get_sla_pct_metrics_per_entity_key({self.tenant_id},splk-{self.component},{object_id})`
"""

        return search_string
|
|
|
|
    def get_mstats_ml_advanced_search(self, object_name):
        """
        Build the advanced mstats ML search for a given object.

        The search fits a DensityFunction on the lag_event_sec metric over
        three windows (30 days, 7 days, 24 hours), extracts the upper
        outlier boundary of each, averages the three when all are numeric
        (falling back to the 7 days value otherwise), and rounds the result
        up to the next full hour to produce the adaptive_delay candidate.

        NOTE(review): the earliest times are hard-coded ("-30d", "-7d",
        "-24h") and do not honour the earliest_time_mstats option — confirm
        whether that option should apply here.

        :param object_name: The name of the object for which to generate the search string.
        :return: A string containing the advanced mstats ML search query.
        """

        search_string = f"""\
| mstats latest(trackme.splk.feeds.lag_event_sec) as lag_event_sec where `trackme_metrics_idx({self.tenant_id})` tenant_id="{self.tenant_id}" object_category="splk-{self.component}" object="{object_name}" earliest="-30d" latest="now" by object span=5m

``` ML calculations for this object ```
| fit DensityFunction lag_event_sec lower_threshold=0.005 upper_threshold=0.005 by object
| rex field=BoundaryRanges "(-Infinity:(?<LowerBound>[\\d|\\.]*))|((?<UpperBound>[\\d|\\.]*):Infinity)"
| foreach LowerBound UpperBound [ eval <<FIELD>> = if(isnum('<<FIELD>>'), '<<FIELD>>', 0) ]
| fields _time object lag_event_sec LowerBound UpperBound

``` retain the UpperBound and perform additional calculations ```
| stats first(UpperBound) as UpperBound, perc95(lag_event_sec) as perc95_lag_event_sec, min(lag_event_sec) as min_lag_event_sec, max(lag_event_sec) as max_lag_event_sec, stdev(lag_event_sec) as stdev_lag_event_sec by object | eval UpperBound=round(UpperBound, 0)
| foreach *_lag_event_sec [ eval <<FIELD>> = round('<<FIELD>>', 0) ]

``` round by the hour, and go at the next hour range ```
| eval adaptive_delay = (round(UpperBound/3600, 0) * 3600) + 3600, adaptive_delay_duration = tostring(adaptive_delay, "duration")

``` rename ```
| rename LowerBound as LowerBound_30d, UpperBound as UpperBound_30d, perc95_lag_event_sec as perc95_lag_event_sec_30d, min_lag_event_sec as min_lag_event_sec_30d, max_lag_event_sec as max_lag_event_sec_30d, stdev_lag_event_sec as stdev_lag_event_sec_30d, adaptive_delay as adaptive_delay_30d, adaptive_delay_duration as adaptive_delay_duration_30d

| join type=outer object [

| mstats latest(trackme.splk.feeds.lag_event_sec) as lag_event_sec where `trackme_metrics_idx({self.tenant_id})` tenant_id="{self.tenant_id}" object_category="splk-{self.component}" object="{object_name}" earliest="-7d" latest="now" by object span=5m

``` ML calculations for this object ```
| fit DensityFunction lag_event_sec lower_threshold=0.005 upper_threshold=0.005 by object
| rex field=BoundaryRanges "(-Infinity:(?<LowerBound>[\\d|\\.]*))|((?<UpperBound>[\\d|\\.]*):Infinity)"
| foreach LowerBound UpperBound [ eval <<FIELD>> = if(isnum('<<FIELD>>'), '<<FIELD>>', 0) ]
| fields _time object lag_event_sec LowerBound UpperBound

``` retain the UpperBound and perform additional calculations ```
| stats first(UpperBound) as UpperBound, perc95(lag_event_sec) as perc95_lag_event_sec, min(lag_event_sec) as min_lag_event_sec, max(lag_event_sec) as max_lag_event_sec, stdev(lag_event_sec) as stdev_lag_event_sec by object | eval UpperBound=round(UpperBound, 0)
| foreach *_lag_event_sec [ eval <<FIELD>> = round('<<FIELD>>', 0) ]

``` round by the hour, and go at the next hour range ```
| eval adaptive_delay = (round(UpperBound/3600, 0) * 3600) + 3600, adaptive_delay_duration = tostring(adaptive_delay, "duration")

``` rename ```
| rename LowerBound as LowerBound_7d, UpperBound as UpperBound_7d, perc95_lag_event_sec as perc95_lag_event_sec_7d, min_lag_event_sec as min_lag_event_sec_7d, max_lag_event_sec as max_lag_event_sec_7d, stdev_lag_event_sec as stdev_lag_event_sec_7d, adaptive_delay as adaptive_delay_7d, adaptive_delay_duration as adaptive_delay_duration_7d

]

| join type=outer object [

| mstats latest(trackme.splk.feeds.lag_event_sec) as lag_event_sec where `trackme_metrics_idx({self.tenant_id})` tenant_id="{self.tenant_id}" object_category="splk-{self.component}" object="{object_name}" earliest="-24h" latest="now" by object span=5m

``` ML calculations for this object ```
| fit DensityFunction lag_event_sec lower_threshold=0.005 upper_threshold=0.005 by object
| rex field=BoundaryRanges "(-Infinity:(?<LowerBound>[\\d|\\.]*))|((?<UpperBound>[\\d|\\.]*):Infinity)"
| foreach LowerBound UpperBound [ eval <<FIELD>> = if(isnum('<<FIELD>>'), '<<FIELD>>', 0) ]
| fields _time object lag_event_sec LowerBound UpperBound

``` retain the UpperBound and perform additional calculations ```
| stats first(UpperBound) as UpperBound, perc95(lag_event_sec) as perc95_lag_event_sec, min(lag_event_sec) as min_lag_event_sec, max(lag_event_sec) as max_lag_event_sec, stdev(lag_event_sec) as stdev_lag_event_sec by object | eval UpperBound=round(UpperBound, 0)
| foreach *_lag_event_sec [ eval <<FIELD>> = round('<<FIELD>>', 0) ]

``` round by the hour, and go at the next hour range ```
| eval adaptive_delay = (round(UpperBound/3600, 0) * 3600) + 3600, adaptive_delay_duration = tostring(adaptive_delay, "duration")

``` rename ```
| rename LowerBound as LowerBound_24h, UpperBound as UpperBound_24h, perc95_lag_event_sec as perc95_lag_event_sec_24h, min_lag_event_sec as min_lag_event_sec_24h, max_lag_event_sec as max_lag_event_sec_24h, stdev_lag_event_sec as stdev_lag_event_sec_24h, adaptive_delay as adaptive_delay_24h, adaptive_delay_duration as adaptive_delay_duration_24h

]

``` aggregate the UpperBound, if for any reason one the UpperBound is not returned as expected, we will use the 7d value ```
| eval UpperBound=case(
isnum(UpperBound_30d) AND isnum(UpperBound_7d) AND isnum(UpperBound_24h), round((UpperBound_30d+UpperBound_7d+UpperBound_24h)/3, 2),
1=1, UpperBound_7d
)
| eval adaptive_delay = (round(UpperBound/3600, 0) * 3600) + 3600, adaptive_delay_duration = tostring(adaptive_delay, "duration")

``` only consider results with a valid numerical adaptive_delay ```
| where isnum(adaptive_delay)
"""

        return search_string
|
|
|
|
    def get_mstats_ml_simple_search(self, object_name):
        """
        Build the simple mstats ML search for a given object.

        The search fits a DensityFunction on the lag_event_sec metric,
        extracts the upper outlier boundary, and rounds it up to the next
        full hour to produce the adaptive_delay candidate.

        NOTE(review): unlike the advanced variant, no earliest/latest is set
        in the SPL — presumably the caller applies earliest_time_mstats when
        dispatching the search; confirm at the call site.

        :param object_name: The name of the object for which to generate the search string.
        :return: A string containing the simple mstats ML search query.
        """

        search_string = f"""\
| mstats latest(trackme.splk.feeds.lag_event_sec) as lag_event_sec where `trackme_metrics_idx({self.tenant_id})` tenant_id="{self.tenant_id}" object_category="splk-{self.component}" object="{object_name}" by object span=5m

``` ML calculations for this object ```
| fit DensityFunction lag_event_sec lower_threshold=0.005 upper_threshold=0.005 by object
| rex field=BoundaryRanges "(-Infinity:(?<LowerBound>[\\d|\\.]*))|((?<UpperBound>[\\d|\\.]*):Infinity)"
| foreach LowerBound UpperBound [ eval <<FIELD>> = if(isnum('<<FIELD>>'), '<<FIELD>>', 0) ]
| fields _time object lag_event_sec LowerBound UpperBound

``` retain the UpperBound and perform additional calculations ```
| stats first(UpperBound) as UpperBound, perc95(lag_event_sec) as perc95_lag_event_sec, min(lag_event_sec) as min_lag_event_sec, max(lag_event_sec) as max_lag_event_sec, stdev(lag_event_sec) as stdev_lag_event_sec by object | eval UpperBound=round(UpperBound, 0)
| foreach *_lag_event_sec [ eval <<FIELD>> = round('<<FIELD>>', 0) ]

``` round by the hour, and go at the next hour range ```
| eval adaptive_delay = (round(UpperBound/3600, 0) * 3600) + 3600, adaptive_delay_duration = tostring(adaptive_delay, "duration")

``` only consider results with a valid numerical adaptive_delay ```
| where isnum(adaptive_delay)
"""

        return search_string
|
|
|
|
def construct_url_for_lag_policy_update(self):
|
|
"""
|
|
Constructs the URL for updating the lag policy based on the component.
|
|
:return: URL string.
|
|
"""
|
|
if self.component == "dsm":
|
|
return (
|
|
"%s/services/trackme/v2/splk_dsm/write/ds_update_lag_policy"
|
|
% self._metadata.searchinfo.splunkd_uri
|
|
)
|
|
elif self.component == "dhm":
|
|
return (
|
|
"%s/services/trackme/v2/splk_dhm/write/dh_update_lag_policy"
|
|
% self._metadata.searchinfo.splunkd_uri
|
|
)
|
|
else:
|
|
# Handle other components or raise an error
|
|
raise ValueError("Invalid component type")
|
|
|
|
def run_post_api_call(
    self,
    entity_dict,
    header,
    max_auto_delay_sec,
    count_updated,
    count_failed,
    count_updated_list,
    count_updated_msg_list,
    count_failed_list,
    count_processed,
    count_processed_list,
    count_processed_msg_list,
    count_failed_msg_list,
):
    """
    Runs a POST API call to update the lag policy for a given entity.

    The entity is skipped (counted as processed, no API call) when its
    computed adaptive_delay already equals the currently applied
    current_max_lag_event_sec. When adaptive_delay exceeds
    max_auto_delay_sec, it is capped to max_auto_delay_sec before the
    update is submitted.

    All counters and lists are passed in, mutated/incremented, and
    returned in the same order so the caller can thread them through
    successive invocations.

    :param entity_dict: Dictionary containing the entity details
        (must contain "object", "adaptive_delay" and
        "current_max_lag_event_sec").
    :param header: Authorization header value for the request
        (e.g. "Splunk <session_key>").
    :param max_auto_delay_sec: Maximum allowed delay for checks.
    :param count_updated: Counter for successful updates.
    :param count_failed: Counter for failed updates.
    :param count_updated_list: List to keep track of updated entities.
    :param count_updated_msg_list: List to keep track of updated messages.
    :param count_failed_list: List to keep track of failed entities.
    :param count_processed: Counter for processed entities.
    :param count_processed_list: List to keep track of processed entities.
    :param count_processed_msg_list: List to keep track of processed messages.
    :param count_failed_msg_list: List to keep track of failure messages.
    :return: Tuple of the updated counters and lists, in the order:
        (count_updated, count_failed, count_updated_list,
        count_updated_msg_list, count_failed_list, count_processed,
        count_processed_list, count_processed_msg_list,
        count_failed_msg_list).
    """
    entity_name = entity_dict.get("object")
    adaptive_delay = float(entity_dict.get("adaptive_delay"))
    current_max_lag_event_sec = float(entity_dict.get("current_max_lag_event_sec"))

    # Proceed only if adaptive_delay != current_max_lag_event_sec:
    # nothing to do when the policy is already at the target value.
    if adaptive_delay == current_max_lag_event_sec:
        log_msg = f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{entity_name}", adaptive_delay="{adaptive_delay}", current_max_lag_event_sec="{current_max_lag_event_sec}", no need to update the lag policy as it already defined to the target value'
        logging.info(log_msg)
        count_processed += 1
        count_processed_list.append(entity_name)
        count_processed_msg_list.append(log_msg)
        # Early return: counted as processed, no API call performed.
        return (
            count_updated,
            count_failed,
            count_updated_list,
            count_updated_msg_list,
            count_failed_list,
            count_processed,
            count_processed_list,
            count_processed_msg_list,
            count_failed_msg_list,
        )

    # If the adaptive_delay is bigger than the max_auto_delay_sec, the adaptive_delay will be set to the max_auto_delay_sec
    elif adaptive_delay > int(max_auto_delay_sec):
        log_msg = f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{entity_name}", adaptive_delay="{adaptive_delay}", current_max_lag_event_sec="{current_max_lag_event_sec}", max_auto_delay_sec={max_auto_delay_sec} has been reached while performing the delay calculation, will be applying the max allowed delay instead.'
        logging.info(log_msg)
        adaptive_delay = int(max_auto_delay_sec)

    # Construct URL based on component
    url = self.construct_url_for_lag_policy_update()

    # Prepare data for the POST request; the full entity context is
    # embedded in the audit update_comment for traceability.
    update_comment_json = {
        "context": "automated adaptive delay update",
        "results": entity_dict,
    }
    data = {
        "tenant_id": self.tenant_id,
        "object_list": entity_name,
        "data_max_delay_allowed": adaptive_delay,
        "update_comment": json.dumps(update_comment_json, indent=0),
    }

    # Make the POST request and handle response
    try:
        # NOTE: verify=False — the call targets the local splunkd
        # endpoint, TLS verification is intentionally disabled.
        response = requests.post(
            url,
            headers={
                "Authorization": header,
                "Content-Type": "application/json",
            },
            data=json.dumps(data),
            verify=False,
            timeout=600,
        )
        if response.status_code not in (200, 201, 204):
            # Non-success HTTP status: record as a failure.
            log_msg = f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{entity_name}", updating lag policy has failed, response.status_code="{response.status_code}", response.text="{response.text}"'
            logging.error(log_msg)
            count_failed += 1
            count_failed_list.append(entity_name)
            count_failed_msg_list.append(log_msg)
        else:
            # Success: the entity counts as both processed and updated.
            log_msg = f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{entity_name}", lag policy updated successfully, adaptive_delay="{adaptive_delay}", response.status_code="{response.status_code}"'
            logging.info(log_msg)
            count_processed += 1
            count_processed_list.append(entity_name)
            count_processed_msg_list.append(log_msg)
            count_updated += 1
            count_updated_list.append(entity_name)
            count_updated_msg_list.append(log_msg)
    except Exception as e:
        # Network/requests-level failure (timeout, connection error, ...):
        # record as a failure rather than aborting the whole run.
        log_msg = f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{entity_name}", updating lag policy has failed, exception="{str(e)}"'
        logging.error(log_msg)
        count_failed += 1
        count_failed_list.append(entity_name)
        count_failed_msg_list.append(log_msg)

    return (
        count_updated,
        count_failed,
        count_updated_list,
        count_updated_msg_list,
        count_failed_list,
        count_processed,
        count_processed_list,
        count_processed_msg_list,
        count_failed_msg_list,
    )
|
|
|
|
def call_component_register(self, action_result, action_message, run_time):
    """
    Record this tracker's execution outcome in the TrackMe component register.

    :param action_result: The result of the action, success or failure
    :param action_message: The message to be displayed in the action
    :param run_time: The time it took to run the action

    :return: None
    """

    searchinfo = self._metadata.searchinfo
    object_category = f"splk-{self.component}"
    report_name = (
        f"trackme_{self.component}_adaptive_delay_tracker_tenant_{self.tenant_id}"
    )

    trackme_register_tenant_object_summary(
        searchinfo.session_key,
        searchinfo.splunkd_uri,
        self.tenant_id,
        object_category,
        report_name,
        action_result,
        time.time(),
        run_time,
        action_message,
        "-5m",
        "now",
    )
|
|
|
|
def generate(self, **kwargs):
|
|
if self:
|
|
|
|
# Track execution times
|
|
execution_times = []
|
|
average_execution_time = 0
|
|
|
|
# performance counter
|
|
start = time.time()
|
|
|
|
# Get request info and set logging level
|
|
reqinfo = trackme_reqinfo(
|
|
self._metadata.searchinfo.session_key,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
)
|
|
log.setLevel(reqinfo["logging_level"])
|
|
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", trackmesplkadaptivedelay is starting now.'
|
|
)
|
|
|
|
# max runtime
|
|
max_runtime = int(self.max_runtime)
|
|
|
|
# Retrieve the search cron schedule
|
|
savedsearch_name = f"trackme_{self.component}_adaptive_delay_tracker_tenant_{self.tenant_id}"
|
|
savedsearch = self.service.saved_searches[savedsearch_name]
|
|
savedsearch_cron_schedule = savedsearch.content["cron_schedule"]
|
|
|
|
# get the cron_exec_sequence_sec
|
|
try:
|
|
cron_exec_sequence_sec = int(cron_to_seconds(savedsearch_cron_schedule))
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", failed to convert the cron schedule to seconds, error="{str(e)}"'
|
|
)
|
|
cron_exec_sequence_sec = max_runtime
|
|
|
|
# the max_runtime cannot be bigger than the cron_exec_sequence_sec
|
|
if max_runtime > cron_exec_sequence_sec:
|
|
max_runtime = cron_exec_sequence_sec
|
|
|
|
logging.info(
|
|
f'max_runtime="{max_runtime}", savedsearch_name="{savedsearch_name}", savedsearch_cron_schedule="{savedsearch_cron_schedule}", cron_exec_sequence_sec="{cron_exec_sequence_sec}"'
|
|
)
|
|
|
|
# Get tenant indexes
|
|
tenant_indexes = trackme_idx_for_tenant(
|
|
self._metadata.searchinfo.session_key,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self.tenant_id,
|
|
)
|
|
tenant_audit_idx = tenant_indexes.get("trackme_audit_idx", "trackme_audit")
|
|
|
|
# Get the session key
|
|
session_key = self._metadata.searchinfo.session_key
|
|
|
|
# Get the vtenant account
|
|
vtenant_account = trackme_vtenant_account(
|
|
session_key, self._metadata.searchinfo.splunkd_uri, self.tenant_id
|
|
)
|
|
adaptive_delay_enabled = int(vtenant_account.get("adaptive_delay", 1))
|
|
|
|
# if adaptive_delay_enabled is not enabled, we will skip the execution, log the information and exit immediately
|
|
if adaptive_delay_enabled == 0:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", adaptive_delay is disabled for this tenant, skipping execution'
|
|
)
|
|
yield_results = {
|
|
"action": "success",
|
|
"tenant_id": self.tenant_id,
|
|
"component": self.component,
|
|
"msg": "adaptive_delay is disabled for this tenant, skipping execution",
|
|
}
|
|
|
|
yield {
|
|
"_time": time.time(),
|
|
"_raw": yield_results,
|
|
}
|
|
return
|
|
|
|
# Add the session_key to the reqinfo
|
|
reqinfo["session_key"] = session_key
|
|
|
|
# Splunk header
|
|
header = f"Splunk {session_key}"
|
|
|
|
# Data collection
|
|
collection_name = f"kv_trackme_{self.component}_tenant_{self.tenant_id}"
|
|
collection = self.service.kvstore[collection_name]
|
|
|
|
# get all records
|
|
(
|
|
collection_records,
|
|
collection_records_dict,
|
|
count_to_process_list,
|
|
) = self.get_collection_records(collection, self.min_delay_sec)
|
|
logging.debug(
|
|
f'retrieving records to be processed, collection_records="{json.dumps(collection_records, indent=2)}"'
|
|
)
|
|
|
|
"""
|
|
Logic description:
|
|
- First, we select entities that are monitored, red, have breached the delay threshold and have a current delay bigger than the min_delay_sec
|
|
- We then exclude entities that have data_override_lagging_class=true and data_allow_adaptive_delay=true
|
|
- We then exclude entities that have been processed in the past 24 hours
|
|
- We process to a ML confidence inspection, if the confidence is low, we will skip the entity, if the entity has been processed in the past 24 hours, we will skip the entity
|
|
- If the entity has been processed in the past 7 days, we will run the ML search with a restricted time range of 7 days to review if the behaviour has changed
|
|
"""
|
|
|
|
# A list to store object processed in the past 30 days prior to -1d
|
|
object_processed_past30days = []
|
|
|
|
# A list to store object processed in the past 15 days prior to -1d
|
|
object_processed_past15days = []
|
|
|
|
# A list to store object processed in the past 7 days prior to -1d
|
|
object_processed_past7days = []
|
|
|
|
# A list to store object processed in the past 24 hours
|
|
object_processed_past24hours = []
|
|
|
|
# A list to store object processed in the past 4 hours
|
|
object_processed_past4hours = []
|
|
|
|
# A list to store object processed in the past 15 days and where the threshold was increased
|
|
object_processed_past15days_threshold_increased = []
|
|
|
|
# A list to store object processed in the past 15 days and where the threshold was decreased
|
|
object_processed_past15days_threshold_decreased = []
|
|
|
|
# A list to store object processed in the past 30 days and where the threshold was increased
|
|
object_processed_past30days_threshold_increased = []
|
|
|
|
# A list to store object processed in the past 30 days and where the threshold was decreased
|
|
object_processed_past30days_threshold_decreased = []
|
|
|
|
# A list to store object processed in the past 7 days and where the threshold was increased
|
|
object_processed_past7days_threshold_increased = []
|
|
|
|
# A list to store object processed in the past 7 days and where the threshold was decreased
|
|
object_processed_past7days_threshold_decreased = []
|
|
|
|
# A list to store object processed in the past 24 hours and where the threshold was increased
|
|
object_processed_past24hours_threshold_increased = []
|
|
|
|
# A list to store object processed in the past 24 hours and where the threshold was decreased
|
|
object_processed_past24hours_threshold_decreased = []
|
|
|
|
# A list to store object processed in the past 4 hours and where the threshold was increased
|
|
object_processed_past4hours_threshold_increased = []
|
|
|
|
# A list to store object processed in the past 4 hours and where the threshold was decreased
|
|
object_processed_past4hours_threshold_decreased = []
|
|
|
|
# An integer counter of the number of changes performed during the past 7 days for each object
|
|
past7days_changes_count = 0
|
|
|
|
# An object summary dict
|
|
object_summary_dict = {}
|
|
|
|
#
|
|
# 0. Check in our logs, identify entities we have recently managed to verify if the status has changed and should be updated
|
|
# - entities processed in the last past 24 hours are added to a special list for further exclusion
|
|
# - entities processed in the last past 7 days are added to a special list for review processing
|
|
# - entities processed in the last past 15 days are added to a special list for review processing
|
|
# - entities processed in the last past 30 days are added to a special list for review processing
|
|
#
|
|
|
|
# kwargs
|
|
kwargs_recent_activity = {
|
|
"earliest_time": "-31d",
|
|
"latest_time": "now",
|
|
"output_mode": "json",
|
|
"count": 0,
|
|
}
|
|
|
|
# conditionally add the earliest_time
|
|
if int(self.review_period_no_days) == 7:
|
|
kwargs_recent_activity["earliest_time"] = "-8d"
|
|
elif int(self.review_period_no_days) == 15:
|
|
kwargs_recent_activity["earliest_time"] = "-16d"
|
|
elif int(self.review_period_no_days) == 30:
|
|
kwargs_recent_activity["earliest_time"] = "-31d"
|
|
|
|
recent_activity_search = remove_leading_spaces(
|
|
self.get_recent_activity_search(tenant_audit_idx)
|
|
)
|
|
# log
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", recent activity inspection, recent_activity_search="{recent_activity_search}", kwargs="{json.dumps(kwargs_recent_activity, indent=0)}"'
|
|
)
|
|
|
|
try:
|
|
search_start = time.time()
|
|
reader = run_splunk_search(
|
|
self.service,
|
|
recent_activity_search,
|
|
kwargs_recent_activity,
|
|
24,
|
|
5,
|
|
)
|
|
|
|
for item in reader:
|
|
if isinstance(item, dict):
|
|
object_summary_dict = self.get_recent_activity_item(
|
|
item,
|
|
collection_records_dict,
|
|
count_to_process_list,
|
|
collection_records,
|
|
object_processed_past30days_threshold_increased,
|
|
object_processed_past30days_threshold_decreased,
|
|
object_processed_past15days_threshold_increased,
|
|
object_processed_past15days_threshold_decreased,
|
|
object_processed_past7days_threshold_increased,
|
|
object_processed_past7days_threshold_decreased,
|
|
object_processed_past24hours_threshold_increased,
|
|
object_processed_past24hours_threshold_decreased,
|
|
object_processed_past4hours_threshold_increased,
|
|
object_processed_past4hours_threshold_decreased,
|
|
object_processed_past4hours,
|
|
object_processed_past24hours,
|
|
object_processed_past7days,
|
|
object_processed_past15days,
|
|
object_processed_past30days,
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", Processing results from recent_activity_results, result="{json.dumps(item, indent=2)}"'
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(f"Failed to execute Splunk search with error: {str(e)}")
|
|
msg = f'tenant_id="{self.tenant_id}", component="splk-{self.component}", recent activity search failed with exception="{str(e)}", run_time="{time.time() - search_start}"'
|
|
logging.error(msg)
|
|
raise Exception(msg)
|
|
|
|
#
|
|
# 1. If we have entities to manage, loop through entities, run an mstats search and use ML density function to define the adaptive_delay value
|
|
# Store results in a dict which will be used to update the KVstore calling the API endpoint
|
|
#
|
|
|
|
# if we have entities to be managed
|
|
|
|
# create a results dict
|
|
adaptive_delay_results = {}
|
|
|
|
# debug
|
|
logging.debug(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", before processing, our collection_records_dict is: {json.dumps(collection_records_dict, indent=2)}'
|
|
)
|
|
|
|
# counters for pending, we will store and render these for additional context
|
|
count_pending = 0
|
|
count_pending_list = []
|
|
count_pending_msg_list = []
|
|
|
|
# Initialize sum of execution times and count of iterations
|
|
total_execution_time = 0
|
|
iteration_count = 0
|
|
|
|
# Other initializations
|
|
max_runtime = int(self.max_runtime)
|
|
|
|
if len(collection_records) != 0:
|
|
for object_id in collection_records_dict:
|
|
|
|
# iteration start
|
|
iteration_start_time = time.time()
|
|
|
|
object_name = collection_records_dict.get(object_id).get("object")
|
|
|
|
# log
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", object_summary_dict="{json.dumps(object_summary_dict, indent=0)}", adaptive delay inspection, we will proceed to ML calculations for this entity'
|
|
)
|
|
|
|
# get current_max_lag_event_sec
|
|
object_current_max_lag_event_sec = collection_records_dict.get(
|
|
object_id
|
|
).get("current_max_lag_event_sec")
|
|
|
|
#
|
|
# Confidence: Verify if we have enough historical metrics to proceed
|
|
#
|
|
|
|
# boolean to defined if ML confidence check is passed
|
|
ml_confidence_check_passed = False
|
|
|
|
# initiate to low
|
|
ml_confidence = "low"
|
|
|
|
# initiate to unknown
|
|
ml_metrics_duration = "unknown"
|
|
|
|
# If the entity has been processed in the past 7 days, ML confidence check is passed already
|
|
if object_name in object_processed_past7days:
|
|
ml_confidence_check_passed = True
|
|
ml_confidence = "normal"
|
|
ml_confidence_reason = f"ML confidence is passed as this entity was processed in the past 7 days."
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", ML confidence inspection, ml_confidence="{ml_confidence}", ml_confidence_reason="{ml_confidence_reason}"'
|
|
)
|
|
|
|
# verify ML confidence
|
|
else:
|
|
# kwargs
|
|
kwargs_confidence = {
|
|
"earliest_time": "-30d",
|
|
"latest_time": "now",
|
|
"output_mode": "json",
|
|
"count": 0,
|
|
}
|
|
|
|
ml_confidence_search = remove_leading_spaces(
|
|
self.get_ml_condidence_search(object_name)
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", ML confidence inspection, ml_confidence_search="{ml_confidence_search}"'
|
|
)
|
|
|
|
try:
|
|
search_start = time.time()
|
|
reader = run_splunk_search(
|
|
self.service,
|
|
ml_confidence_search,
|
|
kwargs_confidence,
|
|
24,
|
|
5,
|
|
)
|
|
|
|
for item in reader:
|
|
if isinstance(item, dict):
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", Processing results from ML confidence inspection, result="{json.dumps(item, indent=2)}"'
|
|
)
|
|
# log
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", ML confidence inspection results, ml_confidence="{item.get("confidence")}", metrics_duration="{item.get("metrics_duration")}"'
|
|
)
|
|
ml_confidence = item.get("confidence", "low")
|
|
ml_metrics_duration = item.get(
|
|
"metrics_duration", "unknown"
|
|
)
|
|
|
|
except Exception as e:
|
|
msg = f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", ML confidence inspection search failed with exception="{str(e)}", run_time="{time.time() - search_start}"'
|
|
logging.error(msg)
|
|
raise Exception(msg)
|
|
|
|
# set the ml_confidence_reason
|
|
if ml_confidence == "low":
|
|
ml_confidence_check_passed = False
|
|
ml_confidence_reason = f"ML has insufficient historical metrics to proceed (metrics_duration={ml_metrics_duration}, required={self.min_historical_metrics_days} days)"
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", ML confidence inspection, ml_confidence="{ml_confidence}", ml_confidence_reason="{ml_confidence_reason}", we will wait for confidence to be normal before proceeding this entity'
|
|
)
|
|
if object_name not in count_pending_list:
|
|
count_pending += 1
|
|
count_pending_list.append(object_name)
|
|
count_pending_msg_list.append(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", ML confidence inspection, ml_confidence="{ml_confidence}", ml_confidence_reason="{ml_confidence_reason}", we will wait for confidence to be normal before proceeding this entity'
|
|
)
|
|
|
|
elif ml_confidence == "normal":
|
|
ml_confidence_check_passed = True
|
|
ml_confidence_reason = f'ML has sufficient historical metrics to proceed (metrics_duration="{ml_metrics_duration}", required="{self.min_historical_metrics_days}" days)'
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", ML confidence inspection, ml_confidence="{ml_confidence}", ml_confidence_reason="{ml_confidence_reason}", we will proceed this entity'
|
|
)
|
|
|
|
#
|
|
# SLA percentage: Verify if the SLA percentage is lower than the max_sla_percentage, if not we will not proceed with this entity
|
|
#
|
|
|
|
# boolean to defined if SLA percentage check is passed, default is True unless proven otherwise
|
|
sla_percentage_check_passed = True
|
|
sla_percentage = 0
|
|
|
|
# kwargs
|
|
kwargs_sla_percentage = {
|
|
"earliest_time": "-90d",
|
|
"latest_time": "now",
|
|
"output_mode": "json",
|
|
"count": 0,
|
|
}
|
|
|
|
sla_percentage_search = remove_leading_spaces(
|
|
self.get_sla_percentage_search(object_id)
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", SLA percentage inspection, sla_percentage_search="{sla_percentage_search}"'
|
|
)
|
|
|
|
try:
|
|
search_start = time.time()
|
|
reader = run_splunk_search(
|
|
self.service,
|
|
sla_percentage_search,
|
|
kwargs_sla_percentage,
|
|
24,
|
|
5,
|
|
)
|
|
|
|
for item in reader:
|
|
if isinstance(item, dict):
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", Processing results from SLA percentage inspection, result="{json.dumps(item, indent=2)}"'
|
|
)
|
|
sla_percentage = float(item.get("percent_sla", 100))
|
|
# log
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", SLA percentage inspection results, sla_percentage="{item.get("sla_percentage")}"'
|
|
)
|
|
|
|
except Exception as e:
|
|
msg = f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", SLA percentage inspection search failed with exception="{str(e)}", run_time="{time.time() - search_start}"'
|
|
logging.error(msg)
|
|
raise Exception(msg)
|
|
|
|
# set the sla_percentage_check_passed and reason
|
|
if sla_percentage > int(self.max_sla_percentage):
|
|
sla_percentage_check_passed = False
|
|
sla_percentage_reason = f"SLA percentage {sla_percentage} is greater than the max_sla_percentage {self.max_sla_percentage}, we will not proceed with this entity"
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", SLA percentage inspection, sla_percentage="{sla_percentage}", sla_percentage_reason="{sla_percentage_reason}", we will not proceed with this entity'
|
|
)
|
|
|
|
if object_name not in count_pending_list:
|
|
count_pending += 1
|
|
count_pending_list.append(object_name)
|
|
count_pending_msg_list.append(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", SLA percentage inspection, sla_percentage="{sla_percentage}", sla_percentage_reason="{sla_percentage_reason}", we will not proceed with this entity'
|
|
)
|
|
|
|
else:
|
|
sla_percentage_check_passed = True
|
|
sla_percentage_reason = f"SLA percentage {sla_percentage} is lower than the max_sla_percentage {self.max_sla_percentage}, we will proceed with this entity"
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", SLA percentage inspection, sla_percentage="{sla_percentage}", sla_percentage_reason="{sla_percentage_reason}", we will proceed this entity'
|
|
)
|
|
|
|
#
|
|
# Proceed ML investigations
|
|
#
|
|
|
|
# boolean proceed investigations (True by default)
|
|
proceed_investigations = True
|
|
|
|
# If updated in the past 4 hours, we will wait whatever the direction of the change and other conditions
|
|
if object_name in object_processed_past4hours:
|
|
proceed_investigations = False
|
|
count_pending += 1
|
|
count_pending_list.append(object_name)
|
|
count_pending_msg_list.append(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", object_summary_dict="{json.dumps(object_summary_dict, indent=0)}", This entity has been updated in the past 4 hours, we will wait before processing this entity again.'
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", object_summary_dict="{json.dumps(object_summary_dict, indent=0)}", This entity has been updated in the past 4 hours, we will wait before processing this entity again.'
|
|
)
|
|
|
|
# else if updated in the past 24 hours and the threshold was increased in the past 24 hours, we will review
|
|
elif (
|
|
object_name in object_processed_past24hours_threshold_increased
|
|
and past7days_changes_count < int(self.max_changes_past_7days)
|
|
):
|
|
proceed_investigations = True
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", object_summary_dict="{json.dumps(object_summary_dict, indent=0)}", This entity has been updated in the past 24 hours and the threshold was increased, we will review this entity again.'
|
|
)
|
|
|
|
# else if we have reached the number of changes allowed for a 7 days time frame, we will wait
|
|
elif past7days_changes_count > int(self.max_changes_past_7days):
|
|
proceed_investigations = False
|
|
count_pending += 1
|
|
count_pending_list.append(object_name)
|
|
count_pending_msg_list.append(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", object_summary_dict="{json.dumps(object_summary_dict, indent=0)}", This entity has reached the number of changes allowed for a 7 days time frame, we will wait before processing this entity again.'
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", object_summary_dict="{json.dumps(object_summary_dict, indent=0)}", This entity has reached the number of changes allowed for a 7 days time frame, we will wait before processing this entity again.'
|
|
)
|
|
|
|
else:
|
|
# proceed if ml confidence check is passed
|
|
if (
|
|
ml_confidence_check_passed == True
|
|
and sla_percentage_check_passed == True
|
|
):
|
|
proceed_investigations = True
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", object_summary_dict="{json.dumps(object_summary_dict, indent=0)}", conditions are met for this entity to be processed.'
|
|
)
|
|
else:
|
|
proceed_investigations = False
|
|
|
|
#
|
|
# Proceed to ML investigations
|
|
#
|
|
|
|
if (
|
|
proceed_investigations
|
|
and ml_confidence_check_passed
|
|
and sla_percentage_check_passed
|
|
):
|
|
# kwargs
|
|
kwargs_ml_mstats = {
|
|
"earliest_time": self.earliest_time_mstats,
|
|
"latest_time": "now",
|
|
"output_mode": "json",
|
|
"count": 0,
|
|
}
|
|
|
|
# search the search string
|
|
|
|
# if object has been processed in the past 7 days, we will run a more complex adaptive logic
|
|
if object_name in object_processed_past7days:
|
|
ml_mstats_search = self.get_mstats_ml_advanced_search(
|
|
object_name
|
|
)
|
|
else:
|
|
ml_mstats_search = self.get_mstats_ml_simple_search(
|
|
object_name
|
|
)
|
|
|
|
# set a version of the search but remove carriage returns for logging purposes
|
|
ml_mstats_search_for_logging = remove_leading_spaces(
|
|
ml_mstats_search
|
|
)
|
|
# remove any carriage returns
|
|
ml_mstats_search_for_logging = (
|
|
ml_mstats_search_for_logging.replace("\n", " ")
|
|
)
|
|
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", running mstats search_string="{remove_leading_spaces(ml_mstats_search)}", kwargs_ml_mstats="{json.dumps(kwargs_ml_mstats, indent=2)}")'
|
|
)
|
|
|
|
try:
|
|
search_start = time.time()
|
|
reader = run_splunk_search(
|
|
self.service,
|
|
remove_leading_spaces(ml_mstats_search),
|
|
kwargs_ml_mstats,
|
|
24,
|
|
5,
|
|
)
|
|
|
|
for item in reader:
|
|
if isinstance(item, dict):
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", Processing results from ML mstats, result="{json.dumps(item, indent=2)}"'
|
|
)
|
|
|
|
# add per entity results in the dict with the key object
|
|
|
|
# add all fields returned in item to adaptive_delay_results[object_id]
|
|
|
|
# init
|
|
adaptive_delay_results[object_id] = {}
|
|
|
|
for k, v in item.items():
|
|
adaptive_delay_results[object_id][k] = v
|
|
|
|
# add current_max_lag_event_sec which is not part of the search results
|
|
adaptive_delay_results[object_id][
|
|
"current_max_lag_event_sec"
|
|
] = object_current_max_lag_event_sec
|
|
|
|
# add ml_mstats_search_for_logging and kwargs_ml_mstats
|
|
adaptive_delay_results[object_id][
|
|
"search_string"
|
|
] = ml_mstats_search_for_logging
|
|
adaptive_delay_results[object_id][
|
|
"search_kwargs"
|
|
] = kwargs_ml_mstats
|
|
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", object="{object_name}", object_id="{object_id}", results="{json.dumps(item, indent=2)}"'
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f"Failed to execute Splunk search with error: {str(e)}"
|
|
)
|
|
msg = f'tenant_id="{self.tenant_id}", component="splk-{self.component}", ML mstats search failed with exception="{str(e)}", run_time="{time.time() - search_start}"'
|
|
logging.error(msg)
|
|
raise Exception(msg)
|
|
|
|
# Calculate the execution time for this iteration
|
|
iteration_end_time = time.time()
|
|
execution_time = iteration_end_time - iteration_start_time
|
|
|
|
# Update total execution time and iteration count
|
|
total_execution_time += execution_time
|
|
iteration_count += 1
|
|
|
|
# Calculate average execution time
|
|
if iteration_count > 0:
|
|
average_execution_time = total_execution_time / iteration_count
|
|
else:
|
|
average_execution_time = 0
|
|
|
|
# Check if there is enough time left to continue
|
|
current_time = time.time()
|
|
elapsed_time = current_time - start
|
|
if elapsed_time + average_execution_time + 120 >= max_runtime:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", max_runtime="{max_runtime}" is about to be reached, current_runtime="{elapsed_time}", job will be terminated now'
|
|
)
|
|
break
|
|
|
|
#
|
|
# 2. Loop through the list adaptive_records_results_list and call the API endpoint to update the lag policy
|
|
#
|
|
|
|
logging.debug(
|
|
f"adaptive_delay_results={json.dumps(adaptive_delay_results, indent=2)}"
|
|
)
|
|
|
|
count_updated = 0
|
|
count_updated_list = []
|
|
count_updated_msg_list = []
|
|
count_processed = 0
|
|
count_processed_list = []
|
|
count_processed_msg_list = []
|
|
count_failed = 0
|
|
count_failed_list = []
|
|
count_failed_msg_list = []
|
|
|
|
for object_id in adaptive_delay_results:
|
|
entity_dict = adaptive_delay_results.get(object_id)
|
|
(
|
|
count_updated,
|
|
count_failed,
|
|
count_updated_list,
|
|
count_updated_msg_list,
|
|
count_failed_list,
|
|
count_processed,
|
|
count_processed_list,
|
|
count_processed_msg_list,
|
|
count_failed_msg_list,
|
|
) = self.run_post_api_call(
|
|
entity_dict,
|
|
header,
|
|
self.max_auto_delay_sec,
|
|
count_updated,
|
|
count_failed,
|
|
count_updated_list,
|
|
count_updated_msg_list,
|
|
count_failed_list,
|
|
count_processed,
|
|
count_processed_list,
|
|
count_processed_msg_list,
|
|
count_failed_msg_list,
|
|
)
|
|
|
|
# action results
|
|
if count_failed == 0:
|
|
action = "success"
|
|
else:
|
|
action = "failure"
|
|
|
|
# set run_time
|
|
run_time = round(time.time() - start, 3)
|
|
|
|
# call the component register
|
|
if action == "success":
|
|
self.call_component_register(
|
|
"success", "The report was executed successfully", run_time
|
|
)
|
|
else:
|
|
self.call_component_register(
|
|
"failure", json.dumps(count_failed_msg_list, indent=0), run_time
|
|
)
|
|
|
|
yield_results = {
|
|
"action": action,
|
|
"tenant_id": self.tenant_id,
|
|
"component": self.component,
|
|
"count_to_process": len(collection_records),
|
|
"count_to_process_list": count_to_process_list,
|
|
"count_processed": count_processed,
|
|
"count_processed_list": count_processed_list,
|
|
"count_processed_msg_list": count_processed_msg_list,
|
|
"count_failed": count_failed,
|
|
"count_failed_list": count_failed_list,
|
|
"count_failed_msg_list": count_failed_msg_list,
|
|
"count_updated": count_updated,
|
|
"count_updated_list": count_updated_list,
|
|
"count_updated_msg_list": count_updated_msg_list,
|
|
"count_pending": count_pending,
|
|
"count_pending_list": count_pending_list,
|
|
"count_pending_msg_list": count_pending_msg_list,
|
|
"count_processed_past30days": object_processed_past30days,
|
|
"count_processed_past15days": object_processed_past15days,
|
|
"count_processed_past7days": object_processed_past7days,
|
|
"count_processed_past24hours": object_processed_past24hours,
|
|
}
|
|
|
|
yield {
|
|
"_time": time.time(),
|
|
"_raw": yield_results,
|
|
}
|
|
|
|
# handler event
|
|
handler_events_records = []
|
|
for object_name in count_processed_list:
|
|
# Find the object_id by looking up in collection_records_dict
|
|
object_id = None
|
|
for key, value in collection_records_dict.items():
|
|
if value.get("object") == object_name:
|
|
object_id = key
|
|
break
|
|
|
|
handler_events_records.append(
|
|
{
|
|
"object": object_name,
|
|
"object_id": object_id,
|
|
"object_category": f"splk-{self.component}",
|
|
"handler": f"trackme_{self.component}_adaptive_delay_tracker_tenant_{self.tenant_id}",
|
|
"handler_message": "Entity was processed by the adaptive delay tracker.",
|
|
"handler_troubleshoot_search": f'index=_internal (sourcetype=trackme:custom_commands:trackmesplkadaptivedelay) tenant_id={self.tenant_id} object="{object_name}"',
|
|
"handler_time": time.time(),
|
|
}
|
|
)
|
|
|
|
# notification event
|
|
try:
|
|
trackme_handler_events(
|
|
session_key=self._metadata.searchinfo.session_key,
|
|
splunkd_uri=self._metadata.searchinfo.splunkd_uri,
|
|
tenant_id=self.tenant_id,
|
|
sourcetype="trackme:handler",
|
|
source=f"trackme:handler:{self.tenant_id}",
|
|
handler_events=handler_events_records,
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", component=f"splk-{self.component}", could not send notification event, exception="{e}"'
|
|
)
|
|
|
|
else:
|
|
# set run_time
|
|
run_time = round(time.time() - start, 3)
|
|
|
|
# Call the component register
|
|
self.call_component_register(
|
|
"success", "The report was executed successfully", run_time
|
|
)
|
|
|
|
yield_results = {
|
|
"action": "success",
|
|
"tenant_id": self.tenant_id,
|
|
"component": self.component,
|
|
"count_to_process": len(collection_records),
|
|
"msg": "no entities to manage currently",
|
|
}
|
|
|
|
yield {
|
|
"_time": time.time(),
|
|
"_raw": yield_results,
|
|
}
|
|
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{self.component}", trackmesplkadaptivedelay has terminated, run_time={run_time}, results="{json.dumps(yield_results, indent=2)}"'
|
|
)
|
|
|
|
|
|
# Module entry point: hand control to splunklib's search-command dispatcher.
# dispatch() parses the SPL invocation from argv/stdin, instantiates the
# AdaptiveDelay generating command, and streams its yielded records to stdout.
# Passing __name__ follows the splunklib convention so the dispatcher only
# runs when splunkd executes this file as a script (presumably a no-op on
# plain import — per splunklib docs; confirm against the installed version).
dispatch(AdaptiveDelay, sys.argv, sys.stdin, sys.stdout, __name__)
|