#!/usr/bin/env python
# coding=utf-8

"""TrackMe - ML Outliers models training helper (generating custom command).

Module-level setup: standard-library imports, dedicated rotating file logging
under $SPLUNK_HOME/var/log/splunk, then Splunk SDK and TrackMe library imports.
"""

__author__ = "TrackMe Limited"
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
__credits__ = "TrackMe Limited, U.K."
__license__ = "TrackMe Limited, all rights reserved"
__version__ = "0.1.0"
__maintainer__ = "TrackMe Limited, U.K."
__email__ = "support@trackme-solutions.com"
__status__ = "PRODUCTION"

# Standard library imports
import os
import sys
import time
import json
import hashlib

# Logging imports
import logging
from logging.handlers import RotatingFileHandler

# Networking imports
import urllib3

# splunkd typically serves a self-signed certificate; silence the
# InsecureRequestWarning that urllib3 would otherwise emit on every call
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# splunk home (raises KeyError if not started within a Splunk environment)
splunkhome = os.environ["SPLUNK_HOME"]

# set logging: this command logs to its own rotating file rather than the
# default search.log, 10MB per file with a single backup
filehandler = RotatingFileHandler(
    f"{splunkhome}/var/log/splunk/trackme_splk_outliers_train_helper.log",
    mode="a",
    maxBytes=10000000,
    backupCount=1,
)
formatter = logging.Formatter(
    "%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s"
)
# log timestamps in UTC
logging.Formatter.converter = time.gmtime
filehandler.setFormatter(formatter)
log = logging.getLogger()  # root logger - Good to get it only once.
for hdlr in log.handlers[:]:  # remove the existing file handlers
    if isinstance(hdlr, logging.FileHandler):
        log.removeHandler(hdlr)
log.addHandler(filehandler)  # set the new handler
# set the log level to INFO, DEBUG as the default is ERROR
log.setLevel(logging.INFO)

# append current directory so sibling TrackMe modules can be imported
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# import libs (path declaration side effects for Splunk app packaging)
import import_declare_test

# import Splunk libs
from splunklib.searchcommands import (
    dispatch,
    GeneratingCommand,
    Configuration,
    Option,
    validators,
)

# import trackme libs
from trackme_libs import (
    trackme_reqinfo,
    trackme_vtenant_account,
    trackme_vtenant_component_info,
    trackme_register_tenant_object_summary,
    run_splunk_search,
    trackme_handler_events,
)

# import trackme libs croniter
from trackme_libs_croniter import cron_to_seconds

# import trackme libs utils
from trackme_libs_utils import remove_leading_spaces
@Configuration(distributed=False)
class SplkOutliersExecutor(GeneratingCommand):
    """Orchestrate the ML Outliers models training for a tenant and component.

    The command retrieves the list of entities due for training from the
    tenant's outliers rules lookup, then sequentially runs the
    ``trackmesplkoutlierstrain`` custom command per entity, while enforcing a
    runtime budget derived from the scheduled report's cron sequence. A JSON
    summary event is yielded at the end, and the run outcome is registered
    against the tenant component summary.
    """

    # The tenant identifier (mandatory).
    tenant_id = Option(
        doc="""
        **Syntax:** **tenant_id=****
        **Description:** The tenant identifier.""",
        require=True,
        default=None,
    )

    # The TrackMe component, restricted by the validator to the five
    # supported values.
    component = Option(
        doc="""
        **Syntax:** **component=****
        **Description:** The component category.""",
        require=True,
        default=None,
        validate=validators.Match("component", r"^(?:dsm|dhm|flx|fqm|wlk)$"),
    )

    # Earliest time quantifier passed to the oneshot searches.
    earliest = Option(
        doc="""
        **Syntax:** **earliest=****
        **Description:** The earliest time quantifier.""",
        require=False,
        default="-24h",
    )

    # Latest time quantifier passed to the oneshot searches.
    latest = Option(
        doc="""
        **Syntax:** **latest=****
        **Description:** The latest time quantifier.""",
        require=False,
        default="now",
    )

    # Max runtime in seconds; digits only (note: the regex also accepts an
    # empty string, in which case int() would fail at runtime).
    max_runtime_sec = Option(
        doc="""
        **Syntax:** **max_runtime_sec=****
        **Description:** The max runtime for the job in seconds, defaults to 60 minutes less 120 seconds of margin.""",
        require=False,
        default="3600",
        validate=validators.Match("max_runtime_sec", r"^\d*$"),
    )

    def generate(self, **kwargs):
        """Run the training orchestration and yield JSON summary events.

        Yields dicts with ``_time`` and ``_raw`` keys: an informational event
        if a schema upgrade is in progress, and a final summary of the
        training run (entities processed, failures, runtime).
        """

        # performance counter
        start = time.time()

        # Track execution times
        average_execution_time = 0

        # Get request info and set logging level
        reqinfo = trackme_reqinfo(
            self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri
        )
        log.setLevel(reqinfo["logging_level"])

        # Get Virtual Tenant account
        vtenant_account = trackme_vtenant_account(
            self._metadata.searchinfo.session_key,
            self._metadata.searchinfo.splunkd_uri,
            self.tenant_id,
        )

        # get vtenant component info
        vtenant_component_info = trackme_vtenant_component_info(
            self._metadata.searchinfo.session_key,
            self._metadata.searchinfo.splunkd_uri,
            self.tenant_id,
        )
        logging.debug(
            f'vtenant_component_info="{json.dumps(vtenant_component_info, indent=2)}"'
        )

        # check schema version migration state; on any failure, fail open
        # (assume no upgrade in progress) so the job can still run
        try:
            schema_version = int(vtenant_component_info["schema_version"])
            schema_version_upgrade_in_progress = bool(
                int(vtenant_component_info["schema_version_upgrade_in_progress"])
            )
            logging.debug(
                f'schema_version_upgrade_in_progress="{schema_version_upgrade_in_progress}"'
            )
        except Exception as e:
            schema_version = 0
            schema_version_upgrade_in_progress = False
            logging.error(
                f'failed to retrieve schema_version_upgrade_in_progress=, exception="{str(e)}"'
            )

        # Do not proceed if the schema version upgrade is in progress
        # NOTE(review): this branch only yields an informational event; the
        # actual short-circuit of the processing happens further below where
        # schema_version_upgrade_in_progress is checked again.
        if schema_version_upgrade_in_progress:
            yield_json = {
                "_time": time.time(),
                "tenant_id": self.tenant_id,
                "component": self.component,
                "response": f'tenant_id="{self.tenant_id}", schema upgrade is currently in progress, we will wait until the process is completed before proceeding, the schema upgrade is handled by the health_tracker of the tenant and is completed once the schema_version field of the Virtual Tenants KVstore (trackme_virtual_tenants) matches TrackMe\'s version, schema_version="{schema_version}", schema_version_upgrade_in_progress="{schema_version_upgrade_in_progress}"',
                "schema_version": schema_version,
                "schema_version_upgrade_in_progress": schema_version_upgrade_in_progress,
            }
            logging.info(json.dumps(yield_json, indent=2))
            yield {
                "_time": yield_json["_time"],
                "_raw": yield_json,
            }

        # get mloutliers, if set 0 then ML Outliers is disabled for the tenant, 1 we can proceed

        # Default to True (ML Outliers enabled)
        outliers_feature_enabled = True

        # Define the valid components
        valid_components = {"dsm", "dhm", "flx", "fqm", "wlk"}

        # Construct the key dynamically (per-component enablement flag in the
        # tenant account record, e.g. mloutliers_dsm)
        key = f"mloutliers_{self.component}"

        # Check if the component is valid and handle exceptions
        if self.component in valid_components:
            try:
                logging.debug(
                    f'checking if the key "{key}" exists in the vtenant_account'
                )

                # missing key defaults to 1 (enabled)
                outliers_enablement = int(vtenant_account.get(key, 1))
                logging.debug(
                    f'vtenant_account="{json.dumps(vtenant_account, indent=2)}", component="{self.component}", key="{key}", outliers_enablement="{outliers_enablement}"'
                )

                if outliers_enablement == 0:
                    outliers_feature_enabled = False
                    logging.debug(
                        f'the key "{key}" exists in the vtenant_account and is set to 0, ML Outliers is disabled for the tenant'
                    )
            except (ValueError, TypeError):
                # non-numeric value: keep the feature enabled
                outliers_feature_enabled = True

        else:
            # unreachable in practice: the Option validator already restricts
            # component to the valid set
            logging.error(
                f'component="{self.component}" is not valid, valid components are {valid_components}'
            )

        if not outliers_feature_enabled or schema_version_upgrade_in_progress:

            if not outliers_feature_enabled:
                # yield and log
                results_dict = {
                    "tenant_id": self.tenant_id,
                    "action": "success",
                    "results": "ML Anomaly Detection feature is disabled for the tenant and component, no action taken",
                    "vtenant_account": vtenant_account,
                }
                yield {"_time": time.time(), "_raw": results_dict}

        else:  # proceed

            # max runtime
            max_runtime = int(self.max_runtime_sec)

            # Retrieve the search cron schedule of the scheduled report that
            # runs this command, to derive the execution sequence
            savedsearch_name = f"trackme_{self.component}_outliers_mltrain_tracker_tenant_{self.tenant_id}"
            savedsearch = self.service.saved_searches[savedsearch_name]
            savedsearch_cron_schedule = savedsearch.content["cron_schedule"]

            # get the cron_exec_sequence_sec
            try:
                cron_exec_sequence_sec = int(cron_to_seconds(savedsearch_cron_schedule))
            except Exception as e:
                logging.error(
                    f'tenant_id="{self.tenant_id}", component="splk-{self.component}", failed to convert the cron schedule to seconds, error="{str(e)}"'
                )
                cron_exec_sequence_sec = max_runtime

            # the max_runtime cannot be bigger than the cron_exec_sequence_sec
            if max_runtime > cron_exec_sequence_sec:
                max_runtime = cron_exec_sequence_sec

            logging.info(
                f'max_runtime="{max_runtime}", savedsearch_name="{savedsearch_name}", savedsearch_cron_schedule="{savedsearch_cron_schedule}", cron_exec_sequence_sec="{cron_exec_sequence_sec}"'
            )

            # Get app level config
            splk_outliers_time_train_mlmodels_default = reqinfo["trackme_conf"][
                "splk_outliers_detection"
            ]["splk_outliers_time_train_mlmodels_default"]
            splk_outliers_max_runtime_train_mlmodels_default = reqinfo["trackme_conf"][
                "splk_outliers_detection"
            ]["splk_outliers_max_runtime_train_mlmodels_default"]

            # Get the session key
            session_key = self._metadata.searchinfo.session_key

            # Get data: oneshot search arguments shared by all searches below
            kwargs_oneshot = {
                "earliest_time": self.earliest,
                "latest_time": self.latest,
                "output_mode": "json",
                "count": 0,
            }

            #
            # RUN
            #

            # Define the search providing the list of entities which models need to be train

            # note: do not allow crap entities, sometimes users get very fancy, entities with double quotes in the value
            # are not expected nor desirable, do not allow crap to get in

            if self.component in ("dsm", "dhm", "flx", "fqm"):
                search_query = remove_leading_spaces(
                    f"""\
                    | inputlookup trackme_{self.component}_outliers_entity_rules_tenant_{self.tenant_id} where object_category="splk-{self.component}"
                    | `trackme_exclude_badentities`
                    | lookup local=t trackme_{self.component}_tenant_{self.tenant_id} object OUTPUT monitored_state
                    | where monitored_state="enabled"
                    | eval duration_since_last=if(last_exec!="pending", now()-last_exec, 0)
                    | where duration_since_last=0 OR duration_since_last>={splk_outliers_time_train_mlmodels_default}
                    | sort - duration_since_last
                """
                )

            # NOTE(review): ("wlk") is a plain string, not a tuple, so this is
            # a substring membership test; it behaves correctly for this
            # single value but ("wlk",) was likely intended — confirm.
            elif self.component in ("wlk"):
                search_query = remove_leading_spaces(
                    f"""\
                    | inputlookup trackme_{self.component}_outliers_entity_rules_tenant_{self.tenant_id} where object_category="splk-{self.component}"
                    | `trackme_exclude_badentities`
                    | lookup local=t trackme_{self.component}_tenant_{self.tenant_id} object OUTPUT app, monitored_state
                    | lookup local=t trackme_{self.component}_apps_enablement_tenant_{self.tenant_id} app OUTPUT enabled as app_enabled
                    | where monitored_state="enabled" AND app_enabled="True"
                    | eval duration_since_last=if(last_exec!="pending", now()-last_exec, 0)
                    | where duration_since_last=0 OR duration_since_last>={splk_outliers_time_train_mlmodels_default}
                    | sort - duration_since_last
                """
                )

            logging.debug(
                f'tenant_id="{self.tenant_id}", component="{self.component}", retrieving the list of entities to be trained from the upstream search="{search_query}"'
            )

            # run search
            try:
                reader = run_splunk_search(
                    self.service,
                    search_query,
                    kwargs_oneshot,
                    24,
                    5,
                )

                # loop through the results, and train models per entity

                # Store various key information
                processed_entities = []
                failures_entities = []
                failures_entities_object_list = []
                searches_toprocess = []

                for item in reader:
                    logging.debug(f'search_results="{item}"')

                    current_time = time.time()
                    elapsed_time = current_time - start

                    if isinstance(item, dict):
                        # break if reaching the max run time less 30 seconds of margin
                        if (time.time() - int(start)) - 30 >= int(
                            splk_outliers_max_runtime_train_mlmodels_default
                        ):
                            # NOTE(review): the message reports start (epoch)
                            # as current_runtime rather than the elapsed time.
                            logging.info(
                                f'tenant_id="{self.tenant_id}" max_runtime="{splk_outliers_max_runtime_train_mlmodels_default}" for ML models was reached with current_runtime="{start}", job will be terminated now'
                            )
                            break

                        # set the search depending on the component
                        search_train = f'| trackmesplkoutlierstrain tenant_id="{self.tenant_id}" component="{self.component}" object="{item.get("object")}"'

                        # append to our list
                        searches_toprocess.append(
                            {
                                "object_category": item.get("object_category"),
                                "object": item.get("object"),
                                "search": search_train,
                            }
                        )

                        logging.debug(
                            f'entity_dict="{json.dumps({"object_category": item.get("object_category"), "object": item.get("object"), "search": search_train}, indent=2)}"'
                        )

            except Exception as e:
                msg = f'tenant_id="{self.tenant_id}", main search failed with exception="{str(e)}"'
                logging.error(msg)
                raise Exception(msg)

            #
            # Process
            #

            logging.debug(
                f'searched to be processed="{json.dumps(searches_toprocess, indent=2)}"'
            )

            # end

            # errors counter
            search_errors_count = 0

            # Initialize sum of execution times and count of iterations
            total_execution_time = 0
            iteration_count = 0

            # Other initializations: keep a 120 seconds safety margin for the
            # final registration and notification steps
            max_runtime = int(self.max_runtime_sec) - 120

            # run
            if len(searches_toprocess) > 0:
                #
                # process searches
                #

                for search_record in searches_toprocess:
                    search = search_record.get("search")
                    object_category = search_record.get("object_category")
                    object_value = search_record.get("object")
                    results_entity = []

                    # iteration start
                    iteration_start_time = time.time()

                    logging.debug(
                        f'tenant_id="{self.tenant_id}" Executing resulting search="{search}"'
                    )

                    # run search
                    substart = time.time()
                    try:
                        reader = run_splunk_search(
                            self.service,
                            search,
                            kwargs_oneshot,
                            24,
                            5,
                        )

                        for item in reader:
                            if isinstance(item, dict):
                                logging.debug(f'search_results="{item}"')
                                results_entity.append(item)

                        # don't be too noisy: the per-entity results are only
                        # included in the summary when running in DEBUG
                        if reqinfo["logging_level"] == "DEBUG":
                            processed_entities.append(
                                {
                                    "object_category": object_category,
                                    "object": object_value,
                                    "results_entity": results_entity,
                                    "search": search,
                                    "runtime": str(time.time() - substart),
                                }
                            )
                        else:
                            processed_entities.append(
                                {
                                    "object_category": object_category,
                                    "object": object_value,
                                    "search": search,
                                    "runtime": str(time.time() - substart),
                                }
                            )

                        # only in debug
                        logging.debug(
                            f'tenant_id="{self.tenant_id}" search successfully executed in {time.time() - substart} seconds'
                        )

                    except Exception as e:
                        # a per-entity failure does not abort the run; it is
                        # recorded and reflected in the final registration
                        msg = f'tenant_id="{self.tenant_id}", component="{self.component}", search="{search}", main search failed with exception="{str(e)}"'
                        logging.error(msg)
                        search_errors_count += 1
                        failures_entities.append(
                            {
                                "object_category": object_category,
                                "object": object_value,
                                "results_entity": {
                                    "action": "failure",
                                    "search": search,
                                    "exception": str(e),
                                },
                            }
                        )
                        failures_entities_object_list.append(
                            {"object": object_value, "exception": str(e)}
                        )

                    # Calculate the execution time for this iteration
                    iteration_end_time = time.time()
                    execution_time = iteration_end_time - iteration_start_time

                    # Update total execution time and iteration count
                    total_execution_time += execution_time
                    iteration_count += 1

                    # Calculate average execution time
                    if iteration_count > 0:
                        average_execution_time = total_execution_time / iteration_count
                    else:
                        average_execution_time = 0

                    # Check if there is enough time left to continue: stop if
                    # one more average-length iteration plus the 120s margin
                    # would exceed the budget
                    current_time = time.time()
                    elapsed_time = current_time - start
                    if elapsed_time + average_execution_time + 120 >= max_runtime:
                        logging.info(
                            f'tenant_id="{self.tenant_id}", component="splk-{self.component}", max_runtime="{max_runtime}" is about to be reached, current_runtime="{elapsed_time}", job will be terminated now'
                        )
                        break

                #
                # end process searches loop
                #

                # yield and log
                results_dict = {
                    "tenant_id": self.tenant_id,
                    "action": "success",
                    "results": "outliers models training job successfully executed",
                    "run_time": round((time.time() - start), 3),
                    "entities_count": len(searches_toprocess),
                    "processed_entities": processed_entities,
                    "failures_entities": failures_entities,
                    "search_errors_count": search_errors_count,
                    "upstream_search_query": search_query,
                }
                yield {"_time": time.time(), "_raw": results_dict}
                logging.info(json.dumps(results_dict, indent=2))

                # handler event: one record per trained entity, keyed by the
                # sha256 of the object name
                handler_events_records = []
                for object_record in processed_entities:
                    handler_events_records.append(
                        {
                            "object": object_record.get("object"),
                            "object_id": hashlib.sha256(
                                object_record.get("object").encode("utf-8")
                            ).hexdigest(),
                            "object_category": f"splk-{self.component}",
                            "handler": f"trackme_{self.component}_outliers_mltrain_tracker_tenant_{self.tenant_id}",
                            "handler_message": "Entity was trained for ML Outliers.",
                            "handler_troubleshoot_search": f"index=_internal (sourcetype=trackme:custom_commands:trackmesplkoutlierstrain) tenant_id={self.tenant_id} object=\"{object_record.get('object')}\"",
                            "handler_time": time.time(),
                        }
                    )

                # notification event (best effort: failures are logged only)
                try:
                    trackme_handler_events(
                        session_key=self._metadata.searchinfo.session_key,
                        splunkd_uri=self._metadata.searchinfo.splunkd_uri,
                        tenant_id=self.tenant_id,
                        sourcetype="trackme:handler",
                        source=f"trackme:handler:{self.tenant_id}",
                        handler_events=handler_events_records,
                    )
                except Exception as e:
                    logging.error(
                        f'tenant_id="{self.tenant_id}", component=f"splk-{self.component}", could not send notification event, exception="{e}"'
                    )

            else:
                # yield and log
                results_dict = {
                    "tenant_id": self.tenant_id,
                    "action": "success",
                    "results": "outliers models training job successfully executed but there were no entities to be trained at this time",
                    "run_time": round((time.time() - start), 3),
                    "entities_count": len(searches_toprocess),
                    "upstream_search_query": search_query,
                }
                yield {"_time": time.time(), "_raw": results_dict}
                logging.info(json.dumps(results_dict, indent=2))

            # Call the component register: record the run outcome against the
            # tenant object summary, with the failures list on failure
            report_name = f"trackme_{self.component}_outliers_mltrain_tracker_tenant_{self.tenant_id}"
            if search_errors_count == 0:
                trackme_register_tenant_object_summary(
                    session_key,
                    self._metadata.searchinfo.splunkd_uri,
                    self.tenant_id,
                    f"splk-{self.component}",
                    report_name,
                    "success",
                    time.time(),
                    round(time.time() - start, 3),
                    "The report was executed successfully",
                    "-24h",
                    "now",
                )

            else:
                trackme_register_tenant_object_summary(
                    session_key,
                    self._metadata.searchinfo.splunkd_uri,
                    self.tenant_id,
                    f"splk-{self.component}",
                    report_name,
                    "failure",
                    time.time(),
                    round(time.time() - start, 3),
                    failures_entities_object_list,
                    "-24h",
                    "now",
                )
# Entry point: wire the command class into Splunk's search pipeline protocol,
# reading the request from stdin and writing results to stdout.
dispatch(SplkOutliersExecutor, sys.argv, sys.stdin, sys.stdout, __name__)