# Source: Splunk_Deploiement/apps/trackme/bin/trackmetrackerhealth.py
# (full file: 4485 lines, 219 KiB)
#!/usr/bin/env python
# coding=utf-8
__author__ = "TrackMe Limited"
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
__credits__ = "TrackMe Limited, U.K."
__license__ = "TrackMe Limited, all rights reserved"
__version__ = "0.1.0"
__maintainer__ = "TrackMe Limited, U.K."
__email__ = "support@trackme-solutions.com"
__status__ = "PRODUCTION"
# Standard library imports
import os
import sys
import time
import json
import uuid
import threading
import hashlib
from logging.handlers import RotatingFileHandler
# Logging imports
import logging
from logging.handlers import RotatingFileHandler
# Networking imports
import requests
from requests.structures import CaseInsensitiveDict
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# splunk home
# Splunk home directory is required to locate the log destination.
splunkhome = os.environ["SPLUNK_HOME"]

# Set logging: dedicated rotating file handler for this tracker's log file.
filehandler = RotatingFileHandler(
    f"{splunkhome}/var/log/splunk/trackme_tracker_health.log",
    mode="a",
    maxBytes=10000000,
    backupCount=1,
)
formatter = logging.Formatter(
    "%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s"
)
# Log timestamps in UTC rather than local time.
logging.Formatter.converter = time.gmtime
filehandler.setFormatter(formatter)

# Root logger: detach any pre-existing file handlers, then attach ours.
log = logging.getLogger()
for existing_handler in [h for h in log.handlers if isinstance(h, logging.FileHandler)]:
    log.removeHandler(existing_handler)
log.addHandler(filehandler)

# Default to INFO (DEBUG/other levels are applied at runtime from TrackMe settings);
# the logging module default would otherwise be ERROR via the root handler.
log.setLevel(logging.INFO)

# Append the current directory to sys.path so local app libs can be imported.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# import libs
import import_declare_test
# import Splunk libs
from splunklib.searchcommands import (
dispatch,
GeneratingCommand,
Configuration,
Option,
validators,
)
import splunklib.results as results
# import trackme libs
from trackme_libs import (
trackme_reqinfo,
trackme_register_tenant_object_summary,
trackme_delete_tenant_object_summary,
trackme_vtenant_account,
trackme_idx_for_tenant,
trackme_state_event,
trackme_register_tenant_component_summary,
trackme_handler_events,
trackme_manage_report_schedule,
trackme_get_version,
)
# import trackme licensing libs
from trackme_libs_licensing import trackme_check_license
# import trackme libs
from trackme_libs import (
trackme_report_update_enablement,
run_splunk_search,
trackme_gen_state,
)
# import trackme libs utils
from trackme_libs_utils import remove_leading_spaces, decode_unicode
# import trackme libs logical groups
from trackme_libs_logicalgroup import (
get_logical_groups_collection_records,
logical_group_remove_object_from_groups,
logical_group_delete_group_by_name,
)
# import TrackMe get data libs
from trackme_libs_get_data import (
get_full_kv_collection,
)
# import default vtenant account settings
from collections_data import vtenant_account_default
# import trackme libs sla
from trackme_libs_sla import trackme_sla_gen_metrics
# import trackme libs schema
from trackme_libs_schema import trackme_schema_format_version
@Configuration(distributed=False)
class HealthTracker(GeneratingCommand):
# Mandatory option: the TrackMe virtual tenant this health tracker runs against.
tenant_id = Option(
doc="""
**Syntax:** **tenant_id=****
**Description:** The tenant identifier.""",
require=True,
default=None,
)
# Optional (defaults to False): when enabled, ACLs of the tenant knowledge
# objects are also retrieved; disabled by default since this adds REST
# traffic and load on splunkd.
get_acl = Option(
doc="""
**Syntax:** **get_acl=****
**Description:** Retrieve ACLs information for the tenant knowledge objects, disabled by default as this can generate more rest traffic and load.""",
require=False,
default=False,
validate=validators.Boolean(),
)
"""
Function to return a unique uuid which is used to trace performance run_time of each subtask.
"""
def get_uuid(self):
return str(uuid.uuid4())
def register_component_summary_async(
    self, session_key, splunkd_uri, tenant_id, component
):
    """
    Register the component summary record for the given tenant/component.

    Any failure is logged and deliberately swallowed: a summary registration
    error must not abort the caller's processing.
    """
    try:
        response = trackme_register_tenant_component_summary(
            session_key,
            splunkd_uri,
            tenant_id,
            component,
        )
        logging.debug(
            f'function="trackme_register_tenant_component_summary", response="{json.dumps(response, indent=2)}"'
        )
    except Exception as e:
        logging.error(
            f'failed to register the component summary with exception="{str(e)}"'
        )
def generate(self, **kwargs):
# performance counter
start = time.time()
# set instance_id
instance_id = self.get_uuid()
# Get request info and set logging level
reqinfo = trackme_reqinfo(
self._metadata.searchinfo.session_key,
self._metadata.searchinfo.splunkd_uri,
)
log.setLevel(reqinfo["logging_level"])
# Build header and target URL
headers = CaseInsensitiveDict()
headers["Authorization"] = f"Splunk {self._metadata.searchinfo.session_key}"
headers["Content-Type"] = "application/json"
# Create a requests session for better performance
session = requests.Session()
session.headers.update(headers)
###########################################################################
# Verify the Virtual Tenant account with privileges escalation
###########################################################################
task_start = time.time()
task_instance_id = self.get_uuid()
task_name = "check_vtenant_accounts"
# start task
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, verifying the vtenant account'
)
try:
vtenant_account = trackme_vtenant_account(
self._metadata.searchinfo.session_key,
self._metadata.searchinfo.splunkd_uri,
self.tenant_id,
)
except Exception as e:
# target
url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/configuration/admin/maintain_vtenant_account"
# proceed
try:
response = session.post(
url,
data=json.dumps(
{"tenant_id": self.tenant_id, "force_create_missing": True}
),
verify=False,
timeout=600,
)
if response.status_code not in (200, 201, 204):
logging.error(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, verify vtenant account has failed, was this account deleted by mistake? response.status_code="{response.status_code}", response.text="{response.text}"'
)
raise Exception(f'verify vtenant account has failed, was this account deleted by mistake? response.status_code="{response.status_code}", response.text="{response.text}"')
else:
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, account was verified successfully'
)
response_json = response.json()
# fetch the vtenant account again
vtenant_account = trackme_vtenant_account(
self._metadata.searchinfo.session_key,
self._metadata.searchinfo.splunkd_uri,
self.tenant_id
)
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, verify vtenant account has failed, exception="{str(e)}"'
)
raise Exception(f'verify vtenant account has failed, exception="{str(e)}"')
# end task
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
)
#
#
#
# get the target index
tenant_indexes = trackme_idx_for_tenant(
self._metadata.searchinfo.session_key,
self._metadata.searchinfo.splunkd_uri,
self.tenant_id,
)
# get global indexes
global_indexes = {
"trackme_summary_idx": reqinfo["trackme_conf"]["index_settings"][
"trackme_summary_idx"
],
"trackme_audit_idx": reqinfo["trackme_conf"]["index_settings"][
"trackme_audit_idx"
],
"trackme_metric_idx": reqinfo["trackme_conf"]["index_settings"][
"trackme_metric_idx"
],
"trackme_notable_idx": reqinfo["trackme_conf"]["index_settings"][
"trackme_notable_idx"
],
}
logging.debug(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, global_indexes="{json.dumps(global_indexes, indent=2)}"'
)
# get trackme release
trackme_version = trackme_get_version(
self.service,
log_context={
"context_prefix": f'tenant_id="{self.tenant_id}", instance_id={instance_id}'
}
)
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, running trackme version="{trackme_version}"'
)
# set the schema_version_required
schema_version_required = trackme_schema_format_version(trackme_version)
# Get the session key
session_key = self._metadata.searchinfo.session_key
# Add the session_key to the reqinfo
reqinfo["session_key"] = session_key
# report name for logging purposes
report_name = f"trackme_health_tracker_tenant_{self.tenant_id}"
# Data collection
collection_name = "kv_trackme_virtual_tenants"
collection = self.service.kvstore[collection_name]
# Get the tenant KVrecord
query_string = {
"tenant_id": self.tenant_id,
}
vtenant_record = collection.data.query(query=json.dumps(query_string))[0]
#
# check license state
#
task_start = time.time()
task_instance_id = self.get_uuid()
task_name = "check_licensing"
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
try:
check_license = trackme_check_license(
reqinfo["server_rest_uri"], session_key
)
license_is_valid = check_license.get("license_is_valid")
license_subscription_class = check_license.get("license_subscription_class")
license_active_tenants = check_license.get("license_active_tenants")
license_active_tenants_list = check_license.get(
"license_active_tenants_list"
)
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, function check_license called, task_instance_id={task_instance_id}, license_is_valid="{license_is_valid}", license_subscription_class="{license_subscription_class}", license_active_tenants="{license_active_tenants}", license_active_tenants_list="{license_active_tenants_list}"'
)
except Exception as e:
license_is_valid = 2
license_subscription_class = "unlimited"
logging.error(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, function check_license has failed, exception="{str(e)}"'
)
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
)
#
# check tenants indexes settings:
# - retrieve the configured indexes for the tenant
# - retrieve via a REST call to splunkd the list of declared indexes on the search head
# - if any of the tenant defines indexes are not declared on the search head, update the tenant indexes settings to fallback to TrackMe default indexes and log the issue
#
task_start = time.time()
task_instance_id = self.get_uuid()
task_name = "check_tenants_indexes_settings"
# start task
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
def get_indexes_by_datatype(datatype=None):
"""Retrieve indexes from the search head by datatype.
Args:
datatype (str, optional): The datatype to filter by (e.g. 'metric').
If None, retrieves all indexes.
Returns:
dict: Dictionary of index names and their datatypes
"""
url = f"{reqinfo['server_rest_uri']}/services/data/indexes?output_mode=json&count=0"
if datatype:
url += f"&datatype={datatype}"
try:
response = requests.get(url, headers=headers, verify=False, timeout=600)
if response.status_code == 200:
indexes_raw = response.json().get("entry", [])
for index in indexes_raw:
if isinstance(index, dict):
index_name = index.get("name")
if index_name:
declared_indexes_dict[index_name] = {
"datatype": index.get("content", {}).get(
"datatype", ""
)
}
logging.debug(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, declared_indexes="{json.dumps(declared_indexes_dict, indent=2)}"'
)
else:
logging.error(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to retrieve indexes list, status code: {response.status_code}'
)
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, could not retrieve the list of declared indexes on the search head, exception="{str(e)}"'
)
def get_fallback_indexes(index_category=None):
"""Retrieve fallback indexes from the search head.
Returns:
dict: Dictionary of fallback indexes
"""
fallback_indexes = {
"trackme_summary_idx": "trackme_summary",
"trackme_audit_idx": "trackme_audit",
"trackme_metric_idx": "trackme_metrics",
"trackme_notable_idx": "trackme_notable",
}
if index_category:
return fallback_indexes.get(index_category, None)
else:
return fallback_indexes
# get the tenant indexes settings
tenant_indexes_settings = trackme_idx_for_tenant(
session_key,
reqinfo["server_rest_uri"],
self.tenant_id,
)
logging.debug(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_indexes_settings="{json.dumps(tenant_indexes_settings, indent=2)}"'
)
""" Example of tenant_indexes_settings:
{
"trackme_summary_idx": "trackme_summary",
"trackme_audit_idx": "trackme_audit",
"trackme_metric_idx": "trackme_metrics",
"trackme_notable_idx": "trackme_notable"
}
"""
# check if tenant_indexes_settings is set to global
tenant_indexes_uses_global_indexes = False
if tenant_indexes_settings == "global":
tenant_indexes_settings = global_indexes
tenant_indexes_uses_global_indexes = True
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_indexes_settings set to global, will check the search head for declared indexes.'
)
# process
declared_indexes_dict = {}
# Get all indexes (events)
get_indexes_by_datatype()
# Get metrics indexes
get_indexes_by_datatype(datatype="metric")
# only proceed if we have declared indexes
if not declared_indexes_dict:
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, no declared indexes found, skipping tenant indexes settings check.'
)
return
# for each index in the tenant indexes settings, check if it is declared on the search head
# we also want to check for trackme_metrics_idx that the datatype is set to "metric"
# if not, we will force update the tenant indexes settings to fallback to TrackMe default indexes
invalid_indexes_settings_detected = False
# process the tenant indexes settings
for index_category, index_value in tenant_indexes_settings.items():
if not isinstance(index_value, str):
logging.error(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, index_category="{index_category}" has invalid index value type: {type(index_value)}'
)
invalid_indexes_settings_detected = True
# update the tenant indexes settings for the current index_category
tenant_indexes_settings[index_category] = get_fallback_indexes(
index_category
)
continue
if index_value not in declared_indexes_dict:
logging.error(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, index_category="{index_category}", index_value="{index_value}" is not declared on the search head, this is an invalid configuration, we will force update the tenant indexes settings to fallback to TrackMe default indexes. Please ensure to define indexes in the search head tier before attempting to configure your tenant indexes settings.'
)
invalid_indexes_settings_detected = True
# update the tenant indexes settings for the current index_category
tenant_indexes_settings[index_category] = get_fallback_indexes(
index_category
)
continue
elif index_category == "trackme_metrics_idx":
index_info = declared_indexes_dict.get(index_value, {})
if index_info.get("datatype") != "metric":
logging.error(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, index_category="{index_category}", index_value="{index_value}" is not configured as a metric index, this is an invalid configuration, we will force update the tenant indexes settings to fallback to TrackMe default indexes.'
)
invalid_indexes_settings_detected = True
# update the tenant indexes settings for the current index_category
tenant_indexes_settings[index_category] = get_fallback_indexes(
index_category
)
continue
if not invalid_indexes_settings_detected:
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, no invalid indexes settings detected, nothing to do.'
)
else:
# If we were using global indexes and found issues, we need to fallback to default indexes
if tenant_indexes_uses_global_indexes:
logging.warning(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, issues detected with global indexes, falling back to default indexes.'
)
tenant_indexes_settings = get_fallback_indexes()
# fix the tenant indexes settings
vtenant_record["tenant_idx_settings"] = json.dumps(
tenant_indexes_settings, indent=2
)
try:
self.service.kvstore["kv_trackme_virtual_tenants"].data.update(
str(vtenant_record["_key"]), json.dumps(vtenant_record)
)
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, vtenant_record updated successfully, new tenant_idx_settings="{json.dumps(tenant_indexes_settings, indent=2)}"'
)
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, Failed to update vtenant_record, exception: {str(e)}'
)
# end task
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
)
##################################################################################
# Global system verifications: verify that the relevant scheduled jobs are enabled
##################################################################################
# These jobs are not tenant specifics, however we use the health tracker to ensure that
# these are effectively enabled when at least one tenant has been created and is active
task_start = time.time()
task_instance_id = self.get_uuid()
task_name = "check_global_trackers_enablement"
# start task
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
savedsearch_names = [
"trackme_ack_expiration_tracker",
"trackme_maintenance_mode_tracker",
"trackme_backup_scheduler",
"trackme_general_health_manager",
]
for savedsearch_name in savedsearch_names:
# check ack expiration tracker
update_properties_required = False
try:
mysavedsearch = self.service.saved_searches[savedsearch_name]
current_disabled = int(mysavedsearch["disabled"])
logging.debug(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, global config check, verifying savedsearch="{mysavedsearch.name}", disabled="{current_disabled}"'
)
if current_disabled == 1:
update_properties_required = True
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, global configuration verification, could not retrieve the status for {savedsearch_name}'
)
if update_properties_required:
try:
action = trackme_report_update_enablement(
session_key,
self._metadata.searchinfo.splunkd_uri,
self.tenant_id,
savedsearch_name,
"enable",
)
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, global config check, enabling savedsearch="{savedsearch_name}", result="{action}"'
)
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, global config check, an exception was encountered while trying to enable savedsearch="{savedsearch_name}", exception="{str(e)}"'
)
# end task
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
)
##################################################################################
# Optimize: enable or disable the schedule for utilities depending on the tenant
# settings, and conditions
##################################################################################
task_start = time.time()
task_instance_id = self.get_uuid()
task_name = "optimize_tenant_scheduled_reports"
# start task
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
# Define the valid components
valid_components = {"dsm", "dhm", "mhm", "flx", "wlk", "fqm"}
def manage_savedsearch_schedule(
savedsearch_names, feature_enabled, feature_name
):
"""
Helper function to manage saved search scheduling based on feature enablement.
Args:
savedsearch_names: List of saved search names to manage
feature_enabled: Boolean indicating if the feature should be enabled
feature_name: String name of the feature for logging purposes
"""
for savedsearch_name in savedsearch_names:
# get the status of the savedsearch
savedsearch_properties, savedsearch_acl = (
trackme_manage_report_schedule(
logging,
session_key,
self._metadata.searchinfo.splunkd_uri,
self.tenant_id,
savedsearch_name,
action="status",
)
)
# log
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", savedsearch_properties="{json.dumps(savedsearch_properties, indent=2)}", savedsearch_acl="{json.dumps(savedsearch_acl, indent=2)}"'
)
# get the is_scheduled status
is_scheduled = int(savedsearch_properties.get("is_scheduled", 0))
# avoid failing to schedule the savedsearch if any of the following is missing or equal to None:
# dispatch.earliest_time
# dispatch.latest_time
# cron_schedule
# schedule_window
# outliers_mltrain:
# "cron_schedule": "*/60 * * * *",
# "dispatch.earliest_time": "-5m",
# "dispatch.latest_time": "now",
# "schedule_window": "5",
# outliers_mlmonitor:
# "cron_schedule": "*/20 * * * *",
# "dispatch.earliest_time": "-5m",
# "dispatch.latest_time": "now",
# "schedule_window": "5",
# data_sampling:
# "cron_schedule": "*/20 * * * *",
# "dispatch.earliest_time": "-24h",
# "dispatch.latest_time": "-4h",
# "schedule_window": "5",
# adaptive_delay:
# "cron_schedule": "*/20 * * * *",
# "dispatch.earliest_time": "-5m",
# "dispatch.latest_time": "now",
# "schedule_window": "5",
# delayed_inspector:
# "cron_schedule": "*/20 * * * *",
# "dispatch.earliest_time": "-5m",
# "dispatch.latest_time": "now",
# "schedule_window": "5",
# if any of these parameters is missing in the savedsearch properties, we need to add them
if "dispatch.earliest_time" not in savedsearch_properties or savedsearch_properties.get("dispatch.earliest_time") in (None, 'None', ''):
if "outliers_mltrain" in savedsearch_name:
savedsearch_properties["dispatch.earliest_time"] = "-5m"
elif "outliers_mlmonitor" in savedsearch_name:
savedsearch_properties["dispatch.earliest_time"] = "-5m"
elif "data_sampling" in savedsearch_name:
savedsearch_properties["dispatch.earliest_time"] = "-24h"
elif "adaptive_delay" in savedsearch_name:
savedsearch_properties["dispatch.earliest_time"] = "-5m"
elif "delayed_entities_inspector" in savedsearch_name:
savedsearch_properties["dispatch.earliest_time"] = "-5m"
else:
savedsearch_properties["dispatch.earliest_time"] = "-5m"
if "dispatch.latest_time" not in savedsearch_properties or savedsearch_properties.get("dispatch.latest_time") in (None, 'None', ''):
if "outliers_mltrain" in savedsearch_name:
savedsearch_properties["dispatch.latest_time"] = "now"
elif "outliers_mlmonitor" in savedsearch_name:
savedsearch_properties["dispatch.latest_time"] = "now"
elif "data_sampling" in savedsearch_name:
savedsearch_properties["dispatch.latest_time"] = "-4h"
elif "adaptive_delay" in savedsearch_name:
savedsearch_properties["dispatch.latest_time"] = "now"
elif "delayed_entities_inspector" in savedsearch_name:
savedsearch_properties["dispatch.latest_time"] = "now"
else:
savedsearch_properties["dispatch.latest_time"] = "now"
if "cron_schedule" not in savedsearch_properties or savedsearch_properties.get("cron_schedule") in (None, 'None', ''):
if "outliers_mltrain" in savedsearch_name:
savedsearch_properties["cron_schedule"] = "0 22-23,0-6 * * *"
elif "outliers_mlmonitor" in savedsearch_name:
savedsearch_properties["cron_schedule"] = "*/20 * * * *"
elif "data_sampling" in savedsearch_name:
savedsearch_properties["cron_schedule"] = "*/20 22-23,0-6 * * *"
elif "adaptive_delay" in savedsearch_name:
savedsearch_properties["cron_schedule"] = "*/20 * * * *"
elif "delayed_entities_inspector" in savedsearch_name:
savedsearch_properties["cron_schedule"] = "*/20 * * * *"
else:
savedsearch_properties["cron_schedule"] = "*/5 * * * *"
if "schedule_window" not in savedsearch_properties or savedsearch_properties.get("schedule_window") in (None, 'None', ''):
savedsearch_properties["schedule_window"] = "5"
# act
if is_scheduled == 1 and feature_enabled == False:
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", disabling savedsearch.'
)
try:
savedsearch_properties, savedsearch_acl = (
trackme_manage_report_schedule(
logging,
session_key,
self._metadata.searchinfo.splunkd_uri,
self.tenant_id,
savedsearch_name,
input_report_properties=savedsearch_properties,
action="disable",
)
)
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", savedsearch updated successfully, properties="{json.dumps(savedsearch_properties, indent=2)}"'
)
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", an exception was encountered while trying to update savedsearch, exception="{str(e)}"'
)
elif is_scheduled == 0 and feature_enabled == True:
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", enabling savedsearch.'
)
try:
savedsearch_properties, savedsearch_acl = (
trackme_manage_report_schedule(
logging,
session_key,
self._metadata.searchinfo.splunkd_uri,
self.tenant_id,
savedsearch_name,
input_report_properties=savedsearch_properties,
action="enable",
)
)
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", savedsearch updated successfully, properties="{json.dumps(savedsearch_properties, indent=2)}"'
)
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", an exception was encountered while trying to update savedsearch, exception="{str(e)}"'
)
else:
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", nothing to do.'
)
# Process except for replica tenants
try:
tenant_replica = int(vtenant_record.get("tenant_replica", 0))
except Exception as e:
tenant_replica = 0
if tenant_replica == 1:
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, detected replica tenant by name pattern, setting tenant_replica=1'
)
# Log replica tenant status for debugging
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_replica="{tenant_replica}", will_process="{tenant_replica == 0}"'
)
if tenant_replica == 0: # only process non-replica tenants (value: 0)
for valid_component in valid_components:
valid_component_is_enabled = int(
vtenant_record.get(f"tenant_{valid_component}_enabled", 0)
)
if valid_component_is_enabled == 1:
# only for dsm/dhm/flx/wlk
if valid_component in ("dsm", "dhm", "flx", "wlk", "fqm"):
#
# ML Outliers
#
try:
savedsearch_names = [
f"trackme_{valid_component}_outliers_mltrain_tracker_tenant_{self.tenant_id}",
f"trackme_{valid_component}_outliers_mlmonitor_tracker_tenant_{self.tenant_id}",
]
# Default to True
feature_enabled = True
# Construct the key dynamically
key = f"mloutliers_{valid_component}"
# Check if the component is valid and handle exceptions
if valid_component in valid_components:
try:
feature_enablement = int(vtenant_account.get(key, 1))
if feature_enablement == 0:
feature_enabled = False
except (ValueError, TypeError):
feature_enabled = True
else:
logging.error(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}" is not valid, valid components are {valid_components}'
)
manage_savedsearch_schedule(
savedsearch_names, feature_enabled, "outliers"
)
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"'
)
#
# Sampling (dsm only)
#
try:
if valid_component == "dsm":
savedsearch_names = [
f"trackme_dsm_data_sampling_tracker_tenant_{self.tenant_id}",
]
# Default to True
feature_enabled = True
# Construct the key dynamically
key = f"sampling"
# Check if the component is valid and handle exceptions
if valid_component in valid_components:
try:
feature_enablement = int(vtenant_account.get(key, 1))
if feature_enablement == 0:
feature_enabled = False
except (ValueError, TypeError):
feature_enabled = True
else:
logging.error(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}" is not valid, valid components are {valid_components}'
)
manage_savedsearch_schedule(
savedsearch_names, feature_enabled, "sampling"
)
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"'
)
#
# Adaptive delay (dsm only)
#
try:
    # The adaptive delay tracker only exists for the dsm component
    if valid_component == "dsm":
        savedsearch_names = [
            f"trackme_dsm_adaptive_delay_tracker_tenant_{self.tenant_id}",
        ]
        # Default to enabled unless the vtenant account explicitly disables it
        feature_enabled = True
        # vtenant account key driving the feature enablement
        # (fix: plain string literal, the original used an f-string with no placeholders)
        key = "adaptive_delay"
        # Check if the component is valid and handle exceptions
        if valid_component in valid_components:
            try:
                # value 0 means explicitly disabled; any other value (or a missing key) means enabled
                feature_enablement = int(vtenant_account.get(key, 1))
                if feature_enablement == 0:
                    feature_enabled = False
            except (ValueError, TypeError):
                # non-numeric value: fall back to enabled (safe default)
                feature_enabled = True
        else:
            logging.error(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}" is not valid, valid components are {valid_components}'
            )
        # enable or disable the scheduled savedsearch accordingly
        manage_savedsearch_schedule(
            savedsearch_names, feature_enabled, "adaptive_delay"
        )
except Exception as e:
    logging.error(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"'
    )
#
# Delayed inspector (dsm/dhm only)
#
try:
    # The delayed entities inspector tracker exists for the dsm and dhm components only
    if valid_component in ("dsm", "dhm"):
        savedsearch_names = [
            f"trackme_{valid_component}_delayed_entities_inspector_tracker_tenant_{self.tenant_id}",
        ]
        # Default to True
        feature_enabled = True
        # Construct the key dynamically
        # The three vtenant account keys covering the inspector time ranges
        keys = [
            "splk_feeds_delayed_inspector_24hours_range_min_sec",
            "splk_feeds_delayed_inspector_7days_range_min_sec",
            "splk_feeds_delayed_inspector_until_disabled_range_min_sec",
        ]
        # Check if the component is valid and handle exceptions (all keys must be set to 0 for the feature to be disabled)
        if valid_component in valid_components:
            try:
                feature_enabled = True  # Default to enabled
                # for/else: the else branch only runs when the loop completes
                # without break, i.e. every key evaluated to 0
                for key in keys:
                    feature_enablement = int(
                        vtenant_account.get(key, 1)
                    )
                    if feature_enablement != 0:
                        # If any key is not 0, the feature should be enabled
                        feature_enabled = True
                        break
                else:
                    # If we get here, all keys were 0, so disable the feature
                    feature_enabled = False
            except (ValueError, TypeError):
                # non-numeric key value: keep the feature enabled (safe default)
                feature_enabled = True
        else:
            logging.error(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}" is not valid, valid components are {valid_components}'
            )
        # enable or disable the scheduled savedsearch accordingly
        manage_savedsearch_schedule(
            savedsearch_names, feature_enabled, "delayed_inspector"
        )
except Exception as e:
    logging.error(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"'
    )
#
# Priority policies: depends on if we have content in the KVstore collection
#
# The priority tracker is scheduled only when the priority policies KVstore
# collection for this component/tenant contains at least one record.
try:
    savedsearch_names = [
        f"trackme_{valid_component}_priority_tracker_tenant_{self.tenant_id}",
    ]
    priority_collection_name = f"kv_trackme_{valid_component}_priority_policies_tenant_{self.tenant_id}"
    priority_collection = self.service.kvstore[priority_collection_name]
    # unpack the 3-tuple (records list, keys, dict keyed by _key)
    (
        priority_records,
        priority_collection_keys,
        priority_collection_dict,
    ) = get_full_kv_collection(
        priority_collection, priority_collection_name
    )
    # check if we have content in the collection
    logging.info(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", priority_collection_name="{priority_collection_name}", priority_records_count="{len(priority_records)}"'
    )
    # non-empty collection -> schedule the tracker, empty -> unschedule it
    feature_enabled = bool(priority_records)
    manage_savedsearch_schedule(
        savedsearch_names, feature_enabled, "priority_policies"
    )
except Exception as e:
    logging.error(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"'
    )
#
# Tags policies: depends on if we have content in the KVstore collection
#
# The tags tracker is scheduled only when the tags policies KVstore collection
# for this component/tenant contains at least one record.
# NOTE(review): tags_records and tags_collection_name are referenced again by
# the SLA section that follows this block — do not rename these locals.
try:
    savedsearch_names = [
        f"trackme_{valid_component}_tags_tracker_tenant_{self.tenant_id}",
    ]
    tags_collection_name = f"kv_trackme_{valid_component}_tags_policies_tenant_{self.tenant_id}"
    tags_collection = self.service.kvstore[tags_collection_name]
    # unpack the 3-tuple (records list, keys, dict keyed by _key)
    tags_records, tags_collection_keys, tags_collection_dict = (
        get_full_kv_collection(tags_collection, tags_collection_name)
    )
    # check if we have content in the collection
    logging.info(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", tags_collection_name="{tags_collection_name}", tags_records_count="{len(tags_records)}"'
    )
    # non-empty collection -> schedule the tracker, empty -> unschedule it
    feature_enabled = bool(tags_records)
    manage_savedsearch_schedule(
        savedsearch_names, feature_enabled, "tags_policies"
    )
except Exception as e:
    logging.error(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"'
    )
#
# SLA policies: depends on if we have content in the KVstore collection
#
# The SLA tracker is scheduled only when the SLA policies KVstore collection
# for this component/tenant contains at least one record.
try:
    savedsearch_names = [
        f"trackme_{valid_component}_sla_tracker_tenant_{self.tenant_id}",
    ]
    sla_collection_name = f"kv_trackme_{valid_component}_sla_policies_tenant_{self.tenant_id}"
    sla_collection = self.service.kvstore[sla_collection_name]
    # unpack the 3-tuple (records list, keys, dict keyed by _key)
    sla_records, sla_collection_keys, sla_collection_dict = (
        get_full_kv_collection(sla_collection, sla_collection_name)
    )
    # check if we have content in the collection
    # (fix: this log line previously reported tags_collection_name/tags_records_count)
    logging.info(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", sla_collection_name="{sla_collection_name}", sla_records_count="{len(sla_records)}"'
    )
    # non-empty collection -> schedule the tracker, empty -> unschedule it
    # (fix: enablement was previously driven by bool(tags_records) — a copy-paste
    # defect which tied the SLA tracker schedule to the tags collection content)
    feature_enabled = bool(sla_records)
    manage_savedsearch_schedule(
        savedsearch_names, feature_enabled, "sla_policies"
    )
except Exception as e:
    logging.error(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"'
    )
#
# Shared Elastic Tracker: depends on if we have content in the KVstore collection (dsm only)
#
# The shared elastic tracker exists for the dsm component only and is scheduled
# when the shared elastic KVstore collection contains at least one record.
try:
    if valid_component == "dsm":
        savedsearch_names = [
            f"trackme_dsm_shared_elastic_tracker_tenant_{self.tenant_id}",
        ]
        shared_elastic_collection_name = (
            f"kv_trackme_dsm_elastic_shared_tenant_{self.tenant_id}"
        )
        shared_elastic_collection = self.service.kvstore[
            shared_elastic_collection_name
        ]
        # unpack the 3-tuple (records list, keys, dict keyed by _key)
        (
            shared_elastic_records,
            shared_elastic_collection_keys,
            shared_elastic_collection_dict,
        ) = get_full_kv_collection(
            shared_elastic_collection, shared_elastic_collection_name
        )
        # check if we have content in the collection
        logging.info(
            f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", shared_elastic_collection_name="{shared_elastic_collection_name}", shared_elastic_records_count="{len(shared_elastic_records)}"'
        )
        # non-empty collection -> schedule the tracker, empty -> unschedule it
        feature_enabled = bool(shared_elastic_records)
        manage_savedsearch_schedule(
            savedsearch_names, feature_enabled, "shared_elastic"
        )
except Exception as e:
    logging.error(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"'
    )
else:
# Skip processing for replica tenants
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, skipping replica tenant processing, tenant_replica="{tenant_replica}"'
)
# end task
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
)
##################################################################################
# Replica orchestrator
##################################################################################
# This scheduled job will automatically be enabled if we detect that at least one
# replica tracker has been created
task_start = time.time()
task_instance_id = self.get_uuid()
task_name = "replica_orchestrator"
# start task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
# Try to get the current definition
try:
    tenant_replica_objects = vtenant_record.get("tenant_replica_objects")
    # logging debug
    logging.debug(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_replica_objects="{tenant_replica_objects}"'
    )
except Exception:
    # no usable replica definition: fall through to the no-op branch below
    tenant_replica_objects = None
# only run if we have a proper replica object
if tenant_replica_objects:
    savedsearch_names = [
        "trackme_replica_executor",
    ]
    for savedsearch_name in savedsearch_names:
        # check whether the report is currently disabled and needs enabling
        update_properties_required = False
        try:
            mysavedsearch = self.service.saved_searches[savedsearch_name]
            current_disabled = int(mysavedsearch["disabled"])
            logging.debug(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, replica config check, verifying savedsearch="{mysavedsearch.name}", disabled="{current_disabled}"'
            )
            if current_disabled == 1:
                update_properties_required = True
        except Exception as e:
            # fix: include the exception detail in the error log (e was previously bound but unused)
            logging.error(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, replica configuration verification, could not retrieve the status for {savedsearch_name}, exception="{str(e)}"'
            )
        if update_properties_required:
            try:
                # enable the report through the TrackMe enablement helper
                action = trackme_report_update_enablement(
                    session_key,
                    self._metadata.searchinfo.splunkd_uri,
                    self.tenant_id,
                    savedsearch_name,
                    "enable",
                )
                logging.info(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, replica config check, enabling savedsearch="{savedsearch_name}", result="{action}"'
                )
            except Exception as e:
                logging.error(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, replica config check, an exception was encountered while trying to enable savedsearch="{savedsearch_name}", exception="{str(e)}"'
                )
# end task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
)
###########################################################################
# schema update and migration: detect and migrate Virtual Tenants if needed
###########################################################################
# Task bookkeeping: wall-clock start time, a fresh unique id for log
# correlation, and the task name used by every log line of this section.
task_start = time.time()
task_instance_id = self.get_uuid()
task_name = "schema_upgrade"
# start task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
from trackme_libs_schema import (
trackme_schema_get_version,
trackme_schema_update_version,
trackme_schema_upgrade_2009,
trackme_schema_upgrade_2015,
trackme_schema_upgrade_2016,
trackme_schema_upgrade_2020,
trackme_schema_upgrade_2026,
trackme_schema_upgrade_2034,
trackme_schema_upgrade_2034_least_privileges,
trackme_schema_upgrade_2036,
trackme_schema_upgrade_2038,
trackme_schema_upgrade_2043,
trackme_schema_upgrade_2044,
trackme_schema_upgrade_2045,
trackme_schema_upgrade_2054,
trackme_schema_upgrade_2064,
trackme_schema_upgrade_2067,
trackme_schema_upgrade_2070,
trackme_schema_upgrade_2071,
trackme_schema_upgrade_2072,
trackme_schema_upgrade_2075,
trackme_schema_upgrade_2078,
trackme_schema_upgrade_2083,
trackme_schema_upgrade_2084,
trackme_schema_upgrade_2087,
trackme_schema_upgrade_2089,
trackme_schema_upgrade_2090,
trackme_schema_upgrade_2091,
trackme_schema_upgrade_2094,
trackme_schema_upgrade_2095,
trackme_schema_upgrade_2096,
trackme_schema_upgrade_2097,
trackme_schema_upgrade_2098,
trackme_schema_upgrade_2099,
trackme_schema_upgrade_2100,
trackme_schema_upgrade_2101,
trackme_schema_upgrade_2102,
trackme_schema_upgrade_2104,
trackme_schema_upgrade_2105,
trackme_schema_upgrade_2107,
trackme_schema_upgrade_2108,
trackme_schema_upgrade_2109,
trackme_schema_upgrade_2110,
trackme_schema_upgrade_2111,
trackme_schema_upgrade_2116,
trackme_schema_upgrade_2118,
trackme_schema_upgrade_2119,
trackme_schema_upgrade_2121,
trackme_schema_upgrade_2122,
trackme_schema_upgrade_2123,
trackme_schema_upgrade_2126,
trackme_schema_upgrade_2128,
trackme_schema_upgrade_2130,
trackme_schema_upgrade_2131,
trackme_schema_upgrade_2132,
trackme_schema_upgrade_2300,
trackme_schema_upgrade_2304,
trackme_schema_upgrade_2305,
)
# Define a mapping between schema versions and their upgrade functions.
# Each tuple is (target_schema_version, upgrade_function): the upgrade loop runs
# the function when the tenant's current schema version is strictly lower than
# the target version, then records that target version after a successful run.
# The duplicated 2034 entry is deliberate: two distinct upgrade functions share
# the same target version.
schema_upgrades = [
    (2009, trackme_schema_upgrade_2009),
    (2015, trackme_schema_upgrade_2015),
    (2016, trackme_schema_upgrade_2016),
    (2020, trackme_schema_upgrade_2020),
    (2026, trackme_schema_upgrade_2026),
    (2034, trackme_schema_upgrade_2034),
    (2034, trackme_schema_upgrade_2034_least_privileges),
    (2036, trackme_schema_upgrade_2036),
    (2038, trackme_schema_upgrade_2038),
    (2043, trackme_schema_upgrade_2043),
    # fix: was (2043, trackme_schema_upgrade_2044) — the mismatched version
    # number skipped the 2044 migration for tenants already at schema 2043 and
    # recorded the wrong version after a successful run.
    (2044, trackme_schema_upgrade_2044),
    (2045, trackme_schema_upgrade_2045),
    (2054, trackme_schema_upgrade_2054),
    (2064, trackme_schema_upgrade_2064),
    (2067, trackme_schema_upgrade_2067),
    (2070, trackme_schema_upgrade_2070),
    (2071, trackme_schema_upgrade_2071),
    (2072, trackme_schema_upgrade_2072),
    (2075, trackme_schema_upgrade_2075),
    (2078, trackme_schema_upgrade_2078),
    (2083, trackme_schema_upgrade_2083),
    (2084, trackme_schema_upgrade_2084),
    (2087, trackme_schema_upgrade_2087),
    (2089, trackme_schema_upgrade_2089),
    (2090, trackme_schema_upgrade_2090),
    (2091, trackme_schema_upgrade_2091),
    (2094, trackme_schema_upgrade_2094),
    (2095, trackme_schema_upgrade_2095),
    (2096, trackme_schema_upgrade_2096),
    (2097, trackme_schema_upgrade_2097),
    (2098, trackme_schema_upgrade_2098),
    (2099, trackme_schema_upgrade_2099),
    (2100, trackme_schema_upgrade_2100),
    (2101, trackme_schema_upgrade_2101),
    (2102, trackme_schema_upgrade_2102),
    (2104, trackme_schema_upgrade_2104),
    (2105, trackme_schema_upgrade_2105),
    (2107, trackme_schema_upgrade_2107),
    (2108, trackme_schema_upgrade_2108),
    (2109, trackme_schema_upgrade_2109),
    (2110, trackme_schema_upgrade_2110),
    (2111, trackme_schema_upgrade_2111),
    (2116, trackme_schema_upgrade_2116),
    (2118, trackme_schema_upgrade_2118),
    (2119, trackme_schema_upgrade_2119),
    (2121, trackme_schema_upgrade_2121),
    (2122, trackme_schema_upgrade_2122),
    (2123, trackme_schema_upgrade_2123),
    (2126, trackme_schema_upgrade_2126),
    (2128, trackme_schema_upgrade_2128),
    (2130, trackme_schema_upgrade_2130),
    (2131, trackme_schema_upgrade_2131),
    (2132, trackme_schema_upgrade_2132),
    (2300, trackme_schema_upgrade_2300),
    (2304, trackme_schema_upgrade_2304),
    (2305, trackme_schema_upgrade_2305),
]
# Get the current schema version
# NOTE(review): on failure this only logs the error — schema_version is then
# left unbound (or stale), which the branch below relies on; verify
# trackme_schema_get_version cannot raise on the very first call for a tenant.
try:
    schema_version = trackme_schema_get_version(
        reqinfo,
        self.tenant_id,
        schema_version_required,
        task_name,
        task_instance_id,
    )
except Exception as e:
    logging.error(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to call function trackme_schema_get_version, exception="{str(e)}"'
    )
# If schema_version_required is 0 (version retrieval failed), skip upgrade logic
# to align with graceful degradation when DB Connect causes permission issues
if schema_version_required == 0:
    logging.warning(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, schema_version_required is 0 (version retrieval failed), skipping schema upgrade logic to prevent data corruption.'
    )
# Proceed
elif not schema_version or int(schema_version) != int(schema_version_required):
    #
    # Backup
    #
    # Check and act accordingly
    trackme_backup_attempted = False
    # Run TrackMe backup: verify if a backup was initiated or performed during the last 24 hours, otherwise initiate a backup
    trackme_backup_run = True
    # recent_backup_events_count
    recent_backup_events_count = 0
    # recent_backup_events_raw
    recent_backup_events_raw = []
    # run a Splunk search to identify the last backup initiated time
    search = remove_leading_spaces(
        f"""\
        search (index=_internal sourcetype=trackme:custom_commands:trackmetrackerhealth task=schema_upgrade "initiating backup now") OR (index=_internal sourcetype=trackme:rest_api trackme.rest.backup_and_restore trackme_rest_handler_backup_and_restore.py post_backup "Backup archive created successfully") | stats count, values(_raw) as last_events
    """
    )
    # kwargs
    kwargs_search = {
        "earliest_time": "-24h",
        "latest_time": "now",
        "preview": "false",
        "output_mode": "json",
        "count": 0,
    }
    logging.info(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, inspecting logs to identify any recent backups.'
    )
    try:
        reader = run_splunk_search(
            self.service,
            search,
            kwargs_search,
            24,
            5,
        )
        for item in reader:
            if isinstance(item, dict):
                recent_backup_events_count = int(item.get("count", 0))
                recent_backup_events_raw = item.get("last_events", [])
    except Exception as e:
        # best-effort: a failed inspection leaves recent_backup_events_count at 0,
        # so a backup will be attempted below
        msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, recent backup identification search failed with exception="{str(e)}"'
        logging.error(msg)
    # if we have detected a recent backup, we will not run a backup
    if recent_backup_events_count > 0:
        logging.info(
            f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, recent backup was detected, no backup will be initiated, recent_backup_events_count="{recent_backup_events_count}", recent_backup_events_raw="{recent_backup_events_raw}"'
        )
        trackme_backup_run = False
    else:
        logging.info(
            f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, no recent backup was detected, initiating backup now, recent_backup_events_count="{recent_backup_events_count}", recent_backup_events_raw="{recent_backup_events_raw}"'
        )
    # before running the first function, execute TrackMe's builtin backup job
    if trackme_backup_run:
        if not trackme_backup_attempted:
            try:
                # call the TrackMe backup REST endpoint synchronously (15 min timeout)
                response = session.post(
                    f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/backup_and_restore/backup",
                    data=json.dumps(
                        {
                            "comment": f"Backup initiated for schema migration from version {schema_version} to {schema_version_required}"
                        }
                    ),
                    verify=False,
                    timeout=900,
                )
                if response.status_code not in (200, 201, 204):
                    logging.error(
                        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, backup post call has failed, response.status_code="{response.status_code}", response.text="{response.text}"'
                    )
                else:
                    logging.info(
                        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, backup post call executed successfully'
                    )
            except Exception as e:
                # backup failure is logged but does not block the upgrade below
                logging.error(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, backup post call has failed, exception="{str(e)}"'
                )
            trackme_backup_attempted = True
    #
    # schema upgrade
    #
    # Run every upgrade function whose target version is above the tenant's
    # current schema version; a failure re-raises and stops the whole upgrade.
    for version, upgrade_func in schema_upgrades:
        if not schema_version or int(schema_version) < version:
            logging.info(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, detected migration required for schema version {version}, schema_version="{schema_version}", schema_version_required="{schema_version_required}", processing now.'
            )
            # proceed
            try:
                # NOTE(review): int(schema_version) raises TypeError when
                # schema_version is None (the "not schema_version" path) — the
                # except below then re-raises; confirm this is intended.
                schema_version_update = upgrade_func(
                    reqinfo,
                    self.tenant_id,
                    int(schema_version),
                    int(schema_version_required),
                    task_name,
                    task_instance_id,
                )
                logging.info(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, schema version {version} migrated successfully.'
                )
                # Update schema version after each successful upgrade
                try:
                    schema_version_update = trackme_schema_update_version(
                        reqinfo,
                        self.tenant_id,
                        version,  # Update to current version being processed
                        task_name,
                        task_instance_id,
                    )
                    logging.info(
                        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, schema version updated to {version} after successful upgrade.'
                    )
                except Exception as e:
                    logging.error(
                        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to update schema version to {version}, exception="{str(e)}"'
                    )
                    raise  # Re-raise the exception to stop the upgrade process
            except Exception as e:
                logging.error(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to call function {upgrade_func.__name__}, exception="{str(e)}"'
                )
                raise  # Re-raise the exception to stop the upgrade process
    #
    # finally migrate the schema version to the required version if not already there
    #
    try:
        # NOTE(review): int(schema_version) also raises here when schema_version
        # is None — the except only logs, so the final version would then never
        # be recorded; verify against the upgrade loop's version updates.
        if int(schema_version) != int(schema_version_required):
            schema_version_update = trackme_schema_update_version(
                reqinfo,
                self.tenant_id,
                schema_version_required,
                task_name,
                task_instance_id,
            )
            logging.info(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, final schema version updated to {schema_version_required}.'
            )
    except Exception as e:
        logging.error(
            f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to call function trackme_schema_update_version, exception="{str(e)}"'
        )
    #
    # check if the vtenant is the last enabled vtenant to be upgraded, if so we will execute the general health tracker
    #
    vtenants_records = collection.data.query()
    vtenants_remaining_count = 0
    # iterate through vtenant records, count remaining vtenants to be upgraded
    for record in vtenants_records:
        schema_version_raw = record.get("schema_version")
        # If schema_version is None (e.g., tenant was created when version retrieval failed),
        # treat it as needing an upgrade
        if schema_version_raw is None:
            schema_version_needs_upgrade = True
        else:
            schema_version_needs_upgrade = int(schema_version_raw) != int(schema_version_required)
        if (
            schema_version_needs_upgrade
            and record.get("tenant_status") == "enabled"
        ):
            vtenants_remaining_count += 1
    if vtenants_remaining_count == 0:
        logging.info(
            f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, all vtenants are up to date, executing the general health tracker'
        )
        try:
            # run the general health manager report once all tenants are upgraded
            reader = run_splunk_search(
                self.service,
                "| savedsearch trackme_general_health_manager",
                {
                    "earliest_time": "-5m",
                    "latest_time": "now",
                    "preview": "false",
                    "output_mode": "json",
                    "count": 0,
                },
                24,
                5,
            )
            logging.info(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, general health tracker executed successfully'
            )
        except Exception as e:
            msg = f'permanently failed to execute the general health tracker search, exception="{str(e)}"'
            logging.error(msg)
            raise Exception(msg)
else:
    # schema already matches the required version: nothing to do
    logging.info(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, schema is up to date, no action required, schema_version="{schema_version}"'
    )
# end task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
)
#
#
#
#
# all components - inspect_collection
#
# context: this activity verifies that the collection record object statuses are consistent according to the Decision Maker
# It works by loading the component data, then looping through objects to verify and update their collection status if needed
for component in ("dsm", "dhm", "mhm", "wlk", "flx", "fqm"):
if vtenant_record.get(f"tenant_{component}_enabled") == True:
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting collection records object statuses now.'
)
# set collection target
inspect_collection_name = (
f"kv_trackme_{component}_tenant_{self.tenant_id}"
)
inspect_collection = self.service.kvstore[inspect_collection_name]
#
# subtask: permanently_deleted_records_inspection
#
task_instance_id = self.get_uuid()
task_start = time.time()
task_name = "inspect_collection:permanently_deleted_records_inspection"
# start task
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
#
# Check permanently deleted records:
# A permanently deleted record should not exist in the main KVstore collection, if it does, it should be purged
#
# Lists to store permanently deleted records found in anomaly
collection_permanently_deleted_records_anomaly = []
# search
search = remove_leading_spaces(
f"""\
| inputlookup trackme_{component}_tenant_{self.tenant_id} | eval keyid=_key
| lookup trackme_common_permanently_deleted_objects_tenant_{self.tenant_id} object, object_category OUTPUT _key as permanently_deleted_keys
| where isnotnull(permanently_deleted_keys)
| table keyid, *
"""
)
# kwargs
kwargs_search = {
"earliest_time": "-5m",
"latest_time": "now",
"preview": "false",
"output_mode": "json",
"count": 0,
}
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting the main data collection for permanently deleted records now.'
)
try:
reader = run_splunk_search(
self.service,
search,
kwargs_search,
24,
5,
)
for item in reader:
if isinstance(item, dict):
collection_permanently_deleted_records_anomaly.append(item)
except Exception as e:
msg = f'permanently deleted records inspection search failed with exception="{str(e)}"'
logging.error(msg)
raise Exception(msg)
if len(collection_permanently_deleted_records_anomaly) > 0:
logging.warning(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, permanently deleted records found, no_records="{len(collection_permanently_deleted_records_anomaly)}"'
)
for record in collection_permanently_deleted_records_anomaly:
try:
inspect_collection.data.delete(
json.dumps({"_key": record.get("keyid")})
)
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, entities in the main collection which are also in ther permanently deleted records were purged successfully, keyid="{record.get("keyid")}", record="{json.dumps(record, indent=1)}"'
)
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, failed to delete permanently deleted records in anmaly, keyid="{record.get("keyid")}", , record="{json.dumps(record, indent=1)}", exception="{str(e)}"'
)
else:
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, no permanenlty deleted records in anomaly found'
)
#
# Check for any duplicated records in the permanently deleted records collection, based on the object field
#
permanently_deleted_records_collection_name = f"kv_trackme_common_permanently_deleted_objects_tenant_{self.tenant_id}"
permanently_deleted_records_collection = self.service.kvstore[permanently_deleted_records_collection_name]
(
permanently_deleted_records,
permanently_deleted_collection_keys,
permanently_deleted_collection_dict,
) = get_full_kv_collection(
permanently_deleted_records_collection, permanently_deleted_records_collection_name
)
# Detect duplicated records (same "(object, object_category)") and collect keys to delete (keep first seen)
duplicated_pd_keys = []
seen_pairs = set()
for pd_key, pd_record in permanently_deleted_collection_dict.items():
object_value = pd_record.get("object")
object_category = pd_record.get("object_category")
if not object_value or not object_category:
continue
pair = (object_value, object_category)
if pair in seen_pairs:
duplicated_pd_keys.append(pd_key)
else:
seen_pairs.add(pair)
if len(duplicated_pd_keys) > 0:
logging.warning(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, permanently deleted records collection has duplicates, duplicates_count="{len(duplicated_pd_keys)}"'
)
for pd_key in duplicated_pd_keys:
try:
permanently_deleted_records_collection.data.delete(json.dumps({"_key": pd_key}))
# best-effort to fetch object for logging
pd_record = permanently_deleted_collection_dict.get(pd_key, {})
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, duplicate in permanently deleted records purged successfully, keyid="{pd_key}", object="{pd_record.get("object")}", object_category="{pd_record.get("object_category")}"'
)
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to purge duplicate in permanently deleted records, keyid="{pd_key}", exception="{str(e)}"'
)
else:
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, no duplicates found in permanently deleted records collection'
)
# end subtask
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
)
#
# subtask: corrupted_records_inspection
#
task_start = time.time()
task_instance_id = self.get_uuid()
task_name = "inspect_collection:corrupted_records_inspection"
# start task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
#
# Check for unexpected corrupted record, a foreign record which has been stored in the KVstore by mistake
# would not have an object value, and would be purged if any.
#
# Lists to store corrupted records
collection_corrupted_records = []
# search: surface any record without an "object" value; keyid carries the KVstore
# _key (internal underscore-prefixed fields are not emitted by "table *")
search = remove_leading_spaces(
    f"""\
    | inputlookup trackme_{component}_tenant_{self.tenant_id} | eval keyid=_key
    | where isnull(object) OR object=""
    | table keyid, *
    """
)
# kwargs for the oneshot search execution
kwargs_search = {
    "earliest_time": "-5m",
    "latest_time": "now",
    "preview": "false",
    "output_mode": "json",
    "count": 0,
}
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting the main data collection for corrupted records now.'
)
try:
    # run_splunk_search: helper defined elsewhere in this app; last two args are
    # presumably retries/backoff — TODO confirm against its signature
    reader = run_splunk_search(
        self.service,
        search,
        kwargs_search,
        24,
        5,
    )
    for item in reader:
        # only dict items are actual results (readers also yield messages)
        if isinstance(item, dict):
            collection_corrupted_records.append(item)
except Exception as e:
    msg = f'corrupted record inspection search failed with exception="{str(e)}"'
    logging.error(msg)
    raise Exception(msg)
if len(collection_corrupted_records) > 0:
    logging.warning(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, corrupted records found, no_records="{len(collection_corrupted_records)}"'
    )
    # purge each corrupted record from the KVstore, best-effort per record
    for corrupted_record in collection_corrupted_records:
        try:
            # delete by query on _key; keyid was populated from _key by the search
            inspect_collection.data.delete(
                json.dumps({"_key": corrupted_record.get("keyid")})
            )
            logging.info(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, corrupted record deleted successfully, keyid="{corrupted_record.get("keyid")}", record="{json.dumps(corrupted_record, indent=1)}"'
            )
        except Exception as e:
            # a failed deletion is logged but does not abort the loop
            logging.error(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, failed to delete corrupted record, keyid="{corrupted_record.get("keyid")}", , record="{json.dumps(corrupted_record, indent=1)}", exception="{str(e)}"'
            )
else:
    logging.info(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, no corrupted records found'
    )
# end subtask
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
)
#
# subtask: missing_tenant_id_records_inspection
#
# Detect KVstore records lacking the tenant_id field and re-stamp them with
# the tenant identifier owning this collection.
task_start = time.time()
task_instance_id = self.get_uuid()
task_name = "inspect_collection:missing_tenant_id_records_inspection"
# start task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
#
# Check for records which would miss the tenant_id field, and add it if needed
#
collection_missing_tenant_id_records = []
# search: keyid carries the KVstore _key ("table *" does not emit internal
# underscore-prefixed fields such as _key)
search = remove_leading_spaces(
    f"""\
    | inputlookup trackme_{component}_tenant_{self.tenant_id} | eval keyid=_key
    | where isnull(tenant_id) OR tenant_id=""
    | table keyid, *
    """
)
# kwargs for the oneshot search execution
kwargs_search = {
    "earliest_time": "-5m",
    "latest_time": "now",
    "preview": "false",
    "output_mode": "json",
    "count": 0,
}
# fixed log message: this subtask targets records missing the tenant_id field
# (the previous message was copy-pasted from the corrupted records subtask)
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting the main data collection for records missing the tenant_id field now.'
)
try:
    reader = run_splunk_search(
        self.service,
        search,
        kwargs_search,
        24,
        5,
    )
    for item in reader:
        if isinstance(item, dict):
            collection_missing_tenant_id_records.append(item)
except Exception as e:
    msg = f'missing tenant_id record inspection search failed with exception="{str(e)}"'
    logging.error(msg)
    raise Exception(msg)
if len(collection_missing_tenant_id_records) > 0:
    logging.warning(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, records found, no_records="{len(collection_missing_tenant_id_records)}"'
    )
    for missing_record in collection_missing_tenant_id_records:
        try:
            # bugfix: the search exposes the KVstore key as "keyid" only (the
            # internal "_key" field is never returned by "table *"), so the
            # update must be keyed on "keyid" instead of the always-missing
            # "_key". The synthetic "keyid" field is removed from the payload
            # so it is not persisted into the record.
            record_key = missing_record.pop("keyid", None)
            missing_record["tenant_id"] = self.tenant_id
            inspect_collection.data.update(
                record_key,
                json.dumps(missing_record),
            )
            logging.info(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, record updated successfully, keyid="{record_key}", record="{json.dumps(missing_record, indent=1)}"'
            )
        except Exception as e:
            # a failed update is logged but does not abort the loop
            logging.error(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, failed to update record, keyid="{record_key}", record="{json.dumps(missing_record, indent=1)}", exception="{str(e)}"'
            )
else:
    logging.info(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, no records found'
    )
# end subtask
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
)
#
# subtask: entities_auto_disablement
#
task_start = time.time()
task_instance_id = self.get_uuid()
task_name = "inspect_collection:entities_auto_disablement"
# start task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
#
# Check for feeds entities to be disabled according to the system wide setting: splk_general_feeds_auto_disablement_period
# This setting allows to disable feeds entities if they have not been updated for a certain period of time
#
# system wide setting
try:
    splk_general_feeds_auto_disablement_period = reqinfo["trackme_conf"][
        "splk_general"
    ]["splk_general_feeds_auto_disablement_period"]
except Exception as e:
    logging.warning(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to get system wide setting, splk_general_feeds_auto_disablement_period, using default value, exception="{str(e)}"'
    )
    splk_general_feeds_auto_disablement_period = "90d"
# tenant setting (override system wide setting, if set)
# NOTE: dict.get() returns None rather than raising when the key is missing,
# so the truthiness fallback below is what actually applies the default.
try:
    splk_feeds_auto_disablement_period = vtenant_account.get(
        "splk_feeds_auto_disablement_period"
    )
except Exception as e:
    logging.warning(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to get tenant setting, splk_feeds_auto_disablement_period, using system wide setting, exception="{str(e)}"'
    )
    splk_feeds_auto_disablement_period = splk_general_feeds_auto_disablement_period
# effective period: the tenant level setting overrides the system wide default
auto_disablement_period = (
    splk_feeds_auto_disablement_period
    if splk_feeds_auto_disablement_period
    else splk_general_feeds_auto_disablement_period
)
# "0d" disables the feature entirely; only feeds components are eligible
if auto_disablement_period != "0d" and component in (
    "dsm",
    "dhm",
    "mhm",
):
    # Lists to store entities to be disabled
    entities_to_be_disabled = []
    # search: entities whose last seen time is beyond the auto-disablement period
    search = remove_leading_spaces(
        f"""\
        | inputlookup trackme_{component}_tenant_{self.tenant_id} | eval keyid=_key
        | eval last_time_seen=coalesce(data_last_time_seen, metric_last_time_seen)
        | where last_time_seen<=relative_time(now(), "-{auto_disablement_period}")
        | table keyid, object, last_time_seen
        | eval last_time_seen_human=strftime(last_time_seen, "%c")
        """
    )
    # kwargs for the oneshot search execution
    kwargs_search = {
        "earliest_time": "-5m",
        "latest_time": "now",
        "preview": "false",
        "output_mode": "json",
        "count": 0,
    }
    logging.info(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting the main data collection for entities to be disabled according to auto-disablement setting. (auto_disablement_period="{auto_disablement_period}")'
    )
    try:
        reader = run_splunk_search(
            self.service,
            search,
            kwargs_search,
            24,
            5,
        )
        for item in reader:
            if isinstance(item, dict):
                entities_to_be_disabled.append(item.get("keyid"))
    except Exception as e:
        msg = f'auto-disablement record inspection search failed with exception="{str(e)}"'
        logging.error(msg)
        raise Exception(msg)
    if len(entities_to_be_disabled) > 0:
        logging.warning(
            f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, entities to be disabled were found, list="{entities_to_be_disabled}"'
        )
        # turn entities_to_be_disabled list into CSV
        entities_to_be_disabled_csv = ",".join(entities_to_be_disabled)
        # call mass disablement endpoint (per component)
        if component == "dsm":
            target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_dsm/write/ds_monitoring"
        elif component == "dhm":
            target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_dhm/write/dh_monitoring"
        elif component == "mhm":
            target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_mhm/write/mh_monitoring"
        try:
            response = session.post(
                target_url,
                data=json.dumps(
                    {
                        "tenant_id": self.tenant_id,
                        "keys_list": entities_to_be_disabled_csv,
                        "action": "disable",
                        # bugfix: report the effective period (tenant override
                        # aware) instead of always the system wide value
                        "update_comment": f"auto-disabled by the system, last seen data is beyond the configured auto-disablement period of {auto_disablement_period}",
                    }
                ),
                verify=False,
                timeout=600,
            )
            if response.status_code not in (200, 201, 204):
                msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, query has failed, response.status_code="{response.status_code}", response.text="{response.text}"'
                logging.error(msg)
            else:
                try:
                    success_count = response.json().get("success_count")
                except Exception as e:
                    success_count = 0
                msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, request was successful, success_count="{success_count}"'
                logging.info(msg)
        except Exception as e:
            # bugfix: fixed "ctask" typo and log a request failure at error level
            msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, request failed with exception="{str(e)}"'
            logging.error(msg)
    else:
        logging.info(
            f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, no entities to be disabled were found'
        )
# end subtask
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
)
#
# subtask: handle_sync_entities
#
# Re-align the KVstore collection with the latest computed object states: any
# entity whose object_state differs between the computed results and the stored
# KVstore record is updated in place, a flipping event is written to the
# summary index, and SLA state metrics are generated.
task_start = time.time()
task_instance_id = self.get_uuid()
task_name = "inspect_collection:handle_sync_entities"
# start task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
#
# Inspecting statuses
#
#
# START raw collections records: Get raw collection records using a Splunk search
#
# search: trackmegetcoll streams the raw collection records, the lookup joins
# the current KVstore state (kvcoll_* fields), and only state deltas are kept
search = remove_leading_spaces(
    f"""\
    | trackmegetcoll tenant_id="{self.tenant_id}" component="{component}" | fields - _raw | table *
    | lookup trackme_{component}_tenant_{self.tenant_id} _key as keyid OUTPUT object_state as kvcoll_object_state, anomaly_reason as kvcoll_anomaly_reason, latest_flip_time as kvcoll_latest_flip_time
    | where object_state!=kvcoll_object_state
    """
)
# kwargs for the oneshot search execution
kwargs_search = {
    "earliest_time": "-5m",
    "latest_time": "now",
    "preview": "false",
    "output_mode": "json",
    "count": 0,
}
# delta bookkeeping: full records, plus key/object sets and a key-indexed dict
delta_records = []
delta_records_keys = set()
delta_records_objects = set()
delta_records_dict = {}
try:
    reader = run_splunk_search(
        self.service,
        search,
        kwargs_search,
        24,
        5,
    )
    for item in reader:
        if isinstance(item, dict):
            delta_records.append(item)
            delta_records_keys.add(item.get("keyid"))
            delta_records_objects.add(item.get("object"))
            delta_records_dict[item.get("keyid")] = item
except Exception as e:
    msg = f'main search failed with exception="{str(e)}"'
    logging.error(msg)
    raise Exception(msg)
#
# END raw collections records: Get raw collection records using a Splunk search
#
#
# Handle delta records
#
inspectcollection_compare_records_start_time = time.time()
for item in delta_records:
    item_key = item.get("keyid")
    item_object = decode_unicode(item.get("object"))
    item_alias = item.get("alias")
    item_object_state = item.get("object_state")
    item_object_category = item.get("object_category")
    item_anomaly_reason = item.get("anomaly_reason")
    item_monitored_state = item.get("monitored_state")
    item_priority = item.get("priority")
    # our delta state
    collection_object_state = item.get("kvcoll_object_state")
    # previous_anomaly_reason
    collection_anomaly_reason = item.get(
        "kvcoll_anomaly_reason", "unknown"
    )
    # previous flip time (stringly typed from the search; fall back to 0)
    try:
        collection_latest_flip_time = float(
            item.get("kvcoll_latest_flip_time", 0)
        )
    except Exception as e:
        collection_latest_flip_time = 0
    # disruption time
    disruption_time = 0
    # compare the object state with item_object_state using decisionmaker_collection_records_dict using the key
    # if the object_state value is different, log the issue
    logging.info(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, collection record object state is not consistent, object="{item_object}", object_id="{item_key}", in_collection_object_state="{collection_object_state}", in_result_object_state="{item_object_state}", in_collection_anomaly_reason="{collection_anomaly_reason}"'
    )
    # get the current kvrecord
    kvrecord_updated = False
    try:
        # fetch the live record, then stamp the new state and flip metadata
        kvrecord = inspect_collection.data.query(
            query=json.dumps({"_key": item_key})
        )[0]
        # update the kvrecord object_state, status_message and anomaly_reason
        kvrecord["object_state"] = item_object_state
        kvrecord["status_message"] = item.get("status_message")
        kvrecord["anomaly_reason"] = item_anomaly_reason
        kvrecord["mtime"] = time.time()
        kvrecord["latest_flip_time"] = time.time()
        kvrecord["latest_flip_state"] = item_object_state
        # process the KVstore record update
        inspect_collection.data.update(item_key, json.dumps(kvrecord))
        kvrecord_updated = True
        logging.info(
            f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, collection record object update successfully, object="{item_object}", object_id="{item_key}"'
        )
    except Exception as e:
        # NOTE(review): collection_name is defined earlier in this method,
        # outside this excerpt — verify it names the collection updated here
        logging.error(
            f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, failed to update the KVstore record, object="{item_object}", collection_name="{collection_name}", exception="{str(e)}"'
        )
    # proceeed with next steps
    if kvrecord_updated:
        try:
            # calculate disruption time if current_state is green and previous_state was red
            if (
                item_object_state == "green"
                and collection_object_state == "red"
            ):
                try:
                    disruption_time = round(
                        (time.time() - collection_latest_flip_time),
                        2,
                    )
                except Exception as e:
                    disruption_time = 0
            flip_timestamp = time.strftime(
                "%d/%m/%Y %H:%M:%S",
                time.localtime(time.time()),
            )
            disruption_time_str = f', disruption_time="{disruption_time}"' if disruption_time and disruption_time > 0 else ""
            flip_result = f'{flip_timestamp}, object="{item_object}" has flipped from previous_state="{collection_object_state}" to state="{item_object_state}" with anomaly_reason="{item_anomaly_reason}", previous_anomaly_reason="{collection_anomaly_reason}"{disruption_time_str}'
            flip_record = {
                "timeStr": flip_timestamp,
                "tenant_id": self.tenant_id,
                "alias": item_alias,
                "keyid": item_key,
                "object": item_object,
                "object_category": item_object_category,
                "object_state": item_object_state,
                "object_previous_state": collection_object_state,
                "priority": item_priority,
                "latest_flip_time": time.time(),
                "latest_flip_state": item_object_state,
                "anomaly_reason": item_anomaly_reason,
                "result": flip_result,
            }
            # add event_id: deterministic digest of the flip record content
            flip_record["event_id"] = hashlib.sha256(
                json.dumps(flip_record).encode()
            ).hexdigest()
            # index the flipping event into the tenant summary index
            trackme_gen_state(
                index=tenant_indexes["trackme_summary_idx"],
                sourcetype="trackme:flip",
                source="flip_state_change_tracking",
                event=flip_record,
            )
            logging.info(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, TrackMe flipping event created successfully, record="{json.dumps(flip_record, indent=1)}"'
            )
        except Exception as e:
            # NOTE(review): flip_record is unbound here if the exception was
            # raised before its assignment above — this handler would then
            # raise a NameError itself; confirm and guard if needed
            logging.error(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, object="{item_object}", task="{task_name}", task_instance_id={task_instance_id}, record="{json.dumps(flip_record, indent=1)}", failed to generate a flipping state event with exception="{e}"'
            )
        #
        # SLA metrics
        #
        # create a list for SLA metrics generation
        sla_metrics_records = []
        # map the colour state to its numeric equivalent for the metric event
        if item_object_state == "green":
            object_num_state = 1
        elif item_object_state == "red":
            object_num_state = 2
        elif item_object_state == "orange":
            object_num_state = 3
        elif item_object_state == "blue":
            object_num_state = 4
        else:
            object_num_state = 5
        # add to our list
        sla_metrics_records.append(
            {
                "tenant_id": self.tenant_id,
                "object_id": item_key,
                "object": item_object,
                "alias": item_alias,
                "object_category": item_object_category,
                "monitored_state": item_monitored_state,
                "priority": item_priority,
                "metrics_event": {"object_state": object_num_state},
            }
        )
        # call the SLA gen metrics function
        sla_metrics_gen_start = time.time()
        try:
            sla_metrics = trackme_sla_gen_metrics(
                self.tenant_id,
                tenant_indexes.get("trackme_metric_idx"),
                sla_metrics_records,
            )
            logging.info(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, function trackme_sla_gen_metrics success {sla_metrics}, run_time={round(time.time()-sla_metrics_gen_start, 3)}, no_entities={len(sla_metrics_records)}'
            )
        except Exception as e:
            logging.error(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, function trackme_sla_gen_metrics failed with exception {str(e)}'
            )
# summary for the whole comparison pass
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, no_delta_records="{len(delta_records_keys)}", run_time="{round((time.time() - inspectcollection_compare_records_start_time), 3)}", collection="{inspect_collection_name}"'
)
#
# END comparison
#
# end subtask
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
)
#
#
#
#
# Call the trackme_register_tenant_component_summary
#
# Use threading to do an async call to the register summary without waiting for it to complete
thread = threading.Thread(
    target=self.register_component_summary_async,
    args=(
        session_key,
        self._metadata.searchinfo.splunkd_uri,
        self.tenant_id,
        component,
    ),
)
# fire and forget: the thread is not joined in this excerpt
thread.start()
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, trackme_register_tenant_component_summary was requested.'
)
#
# task: untracked_entities
#
#
# splk-dsm - untracked entities
#
# context: this activity tracks and maintain state for untracked entities
# untracked entities are entities which are entirely out of the scope of any trackers, and therefore not maintained otherwise
task_instance_id = self.get_uuid()
task_name = "untracked_entities"
task_start = time.time()
# start task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
if vtenant_record.get("tenant_dsm_enabled") == True:
    component = "dsm"
    logging.info(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting untracked entities now.'
    )
    # kwargs for the oneshot search execution
    kwargs_oneshot = {
        "earliest_time": "-5m",
        "latest_time": "now",
        "output_mode": "json",
        "count": 0,
    }
    untracked_entities_count = 0
    untracked_entities_processed_objects = []
    # search: re-run the abstract tracker logic against entities not refreshed
    # for more than 15 minutes, persist state, flips and the delay metric
    untracked_entities_search = f"""\
    | inputlookup trackme_{component}_tenant_{self.tenant_id} | eval key=_key
    ``` target any entity that has not been updated since more than 15m ```
    | eval time_sec_since_inspection=now()-tracker_runtime
    | where ( time_sec_since_inspection>900 OR isnull(tracker_runtime) )
    ``` called the offline abstract macro version ```
    `trackme_{component}_tracker_abstract({self.tenant_id})`
    ``` collects latest collection state into the summary index ```
    | `trackme_collect_state("current_state_tracking:splk-{component}:{self.tenant_id}", "object", "{self.tenant_id}")`
    ``` output flipping change status if changes ```
    | trackmesplkgetflipping tenant_id="{self.tenant_id}" object_category="splk-{component}"
    ``` update the KVstore collection ```
    | `trackme_outputlookup_tracker_health(trackme_{component}_tenant_{self.tenant_id}, key)`
    ``` update the delay metric only ```
    | `trackme_mcollect(object, splk-{component}, "metric_name:trackme.splk.feeds.lag_event_sec=data_last_lag_seen", "tenant_id, object_category, object", "{self.tenant_id}")`
    ``` summarize job ```
    | stats count as report_entities_count, values(object) as objects by tenant_id
    """
    # run the main report, every result is a Splunk search to be executed on its own thread
    try:
        reader = run_splunk_search(
            self.service,
            untracked_entities_search,
            kwargs_oneshot,
            24,
            5,
        )
        for item in reader:
            if isinstance(item, dict):
                untracked_entities_count += 1
                # NOTE(review): len(item) counts result fields, not entities —
                # report_entities_count is likely the intended value; kept as is
                logging.info(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, entities_count="{len(item)}"'
                )
                untracked_entities_processed_objects = item.get("objects", [])
        if untracked_entities_count == 0:
            logging.info(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, there are no untracked entities currently.'
            )
    except Exception as e:
        # Call the component register to flag the tenant failure
        trackme_register_tenant_object_summary(
            session_key,
            self._metadata.searchinfo.splunkd_uri,
            self.tenant_id,
            "all",
            report_name,
            "failure",
            time.time(),
            str(time.time() - start),
            str(e),
            "-5m",
            "now",
        )
        msg = f'task="{task_name}", task_instance_id={task_instance_id}, tenant_id="{self.tenant_id}", main search failed with exception="{str(e)}"'
        logging.error(msg)
        raise Exception(msg)
    if untracked_entities_processed_objects:
        # bugfix: align with the splk-dhm branch — Splunk returns a plain string
        # when a single object is reported; without this normalization the loop
        # below would iterate the string character by character
        if isinstance(untracked_entities_processed_objects, str):
            untracked_entities_processed_objects = [
                untracked_entities_processed_objects
            ]
        handler_events_records = []
        for object_name in untracked_entities_processed_objects:
            handler_events_records.append(
                {
                    "object": object_name,
                    "object_category": f"splk-{component}",
                    "object_id": hashlib.sha256(
                        object_name.encode("utf-8")
                    ).hexdigest(),
                    "handler": "health_tracker:untracked_entities",
                    "handler_message": "Entity was inspected by the heath tracker, it is out of the scope of any hybrid tracker due to high delay and/or latency.",
                    "handler_troubleshoot_search": f"index=_internal sourcetype=trackme:custom_commands:trackmetrackerhealth tenant_id={self.tenant_id} component=splk-{component} task=untracked_entities",
                    "handler_time": time.time(),
                }
            )
        # notification event
        try:
            trackme_handler_events(
                session_key=self._metadata.searchinfo.session_key,
                splunkd_uri=self._metadata.searchinfo.splunkd_uri,
                tenant_id=self.tenant_id,
                sourcetype="trackme:handler",
                source=f"trackme:handler:{self.tenant_id}",
                handler_events=handler_events_records,
            )
        except Exception as e:
            logging.error(
                f'tenant_id="{self.tenant_id}", component="splk-{component}", could not send notification event, exception="{e}"'
            )
#
# splk-dhm - untracked entities
#
# context: this activity tracks and maintain state for untracked entities
# untracked entities are entities which are entirely out of the scope of any trackers, and therefore not maintained otherwise
if vtenant_record.get("tenant_dhm_enabled") == True:
    component = "dhm"
    logging.info(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting untracked entities now.'
    )
    # kwargs for the oneshot search execution
    kwargs_oneshot = {
        "earliest_time": "-5m",
        "latest_time": "now",
        "output_mode": "json",
        "count": 0,
    }
    untracked_entities_count = 0
    untracked_entities_processed_objects = []
    # search: re-run the abstract tracker logic against entities not refreshed
    # for more than 15 minutes, persist state, flips and the delay metric
    untracked_entities_search = f"""\
    | inputlookup trackme_{component}_tenant_{self.tenant_id} | eval key=_key
    ``` target any entity that has not been updated since more than 15m ```
    | eval time_sec_since_inspection=now()-tracker_runtime
    | where ( time_sec_since_inspection>900 OR isnull(tracker_runtime) )
    ``` called the offline abstract macro version ```
    `trackme_{component}_tracker_abstract({self.tenant_id})`
    ``` collects latest collection state into the summary index ```
    | `trackme_collect_state("current_state_tracking:splk-{component}:{self.tenant_id}", "object", "{self.tenant_id}")`
    ``` output flipping change status if changes ```
    | trackmesplkgetflipping tenant_id="{self.tenant_id}" object_category="splk-{component}"
    ``` update the KVstore collection ```
    | `trackme_outputlookup_tracker_health(trackme_{component}_tenant_{self.tenant_id}, key)`
    ``` update the delay metric only ```
    | `trackme_mcollect(object, splk-{component}, "metric_name:trackme.splk.feeds.lag_event_sec=data_last_lag_seen", "tenant_id, object_category, object", "{self.tenant_id}")`
    ``` summarize job ```
    | stats count as report_entities_count, values(object) as objects by tenant_id
    """
    # run the main report, every result is a Splunk search to be executed on its own thread
    try:
        reader = run_splunk_search(
            self.service,
            untracked_entities_search,
            kwargs_oneshot,
            24,
            5,
        )
        for item in reader:
            if isinstance(item, dict):
                untracked_entities_count += 1
                # NOTE(review): len(item) counts result fields, not entities —
                # report_entities_count is likely the intended value
                logging.info(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, entities_count="{len(item)}"'
                )
                untracked_entities_processed_objects = item.get("objects", [])
        if untracked_entities_count == 0:
            logging.info(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, there are no untracked entities currently.'
            )
    except Exception as e:
        # Call the component register to flag the tenant failure
        trackme_register_tenant_object_summary(
            session_key,
            self._metadata.searchinfo.splunkd_uri,
            self.tenant_id,
            "all",
            report_name,
            "failure",
            time.time(),
            str(time.time() - start),
            str(e),
            "-5m",
            "now",
        )
        msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, main search failed with exception="{str(e)}"'
        logging.error(msg)
        raise Exception(msg)
    if untracked_entities_processed_objects:
        # if untracked_entities_processed_objects is a string (a single object was reported), convert it to a list
        if isinstance(untracked_entities_processed_objects, str):
            untracked_entities_processed_objects = [
                untracked_entities_processed_objects
            ]
        handler_events_records = []
        for object_name in untracked_entities_processed_objects:
            handler_events_records.append(
                {
                    "object": object_name,
                    "object_id": hashlib.sha256(
                        object_name.encode("utf-8")
                    ).hexdigest(),
                    "object_category": f"splk-{component}",
                    "handler": "health_tracker:untracked_entities",
                    "handler_message": "Entity was inspected by the heath tracker, it is out of the scope of any hybrid tracker due to high delay and/or latency.",
                    "handler_troubleshoot_search": f"index=_internal sourcetype=trackme:custom_commands:trackmetrackerhealth tenant_id={self.tenant_id} component=splk-{component} task=untracked_entities",
                    "handler_time": time.time(),
                }
            )
        # notification event
        try:
            trackme_handler_events(
                session_key=self._metadata.searchinfo.session_key,
                splunkd_uri=self._metadata.searchinfo.splunkd_uri,
                tenant_id=self.tenant_id,
                sourcetype="trackme:handler",
                source=f"trackme:handler:{self.tenant_id}",
                handler_events=handler_events_records,
            )
        except Exception as e:
            logging.error(
                f'tenant_id="{self.tenant_id}", component="splk-{component}", could not send notification event, exception="{e}"'
            )
# end task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
)
#
# task: duplicated_entities
#
task_instance_id = self.get_uuid()
task_name = "duplicated_entities"
task_start = time.time()
# start task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
# all components except splk-wlk
#
# context: this situation is not expected, but if we have duplicated entities, we need to verify and purge them
# splk-wlk - duplicated entities
#
# context: this activity tracks for duplicated entities in the Workload component
# under some rare circumstances, the Splunk scheduler logs may lack the user context, althrough we implement several safeties
# if this happens, we need to verify and purge any duplicated entity with the system user context instead of the proper user context
for component in ("dsm", "dhm", "mhm", "wlk", "flx", "fqm"):
    # only process enabled components on non-replica tenants
    if (
        vtenant_record.get(f"tenant_{component}_enabled") == True
        and vtenant_record.get("tenant_replica") == False
    ):
        logging.info(
            f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting entities now.'
        )
        # kwargs for the oneshot search execution
        kwargs_oneshot = {
            "earliest_time": "-5m",
            "latest_time": "now",
            "output_mode": "json",
            "count": 0,
        }
        duplicated_entities_count = 0
        duplicated_entities_list = []
        # specific search for wlk: duplicates are keyed on account/app/savedsearch
        if component == "wlk":
            duplicated_entities_search = remove_leading_spaces(
                f"""\
                | inputlookup trackme_wlk_tenant_{self.tenant_id} | eval keyid=_key
                | fields keyid, account, app, user, savedsearch_name, object, last_seen
                | eventstats count as dcount by account, app, savedsearch_name
                | where dcount>1
                | sort - 0 savedsearch_name, last_seen
                """
            )
        else: # other components
            # duplicates are keyed on object; the final eval overrides the
            # streamstats rank to keep FIPS-migrated (64-char sha256) keys
            duplicated_entities_search = remove_leading_spaces(
                f"""\
                | inputlookup trackme_{component}_tenant_{self.tenant_id} | eval keyid=_key
                | sort 0 object
                | eventstats count as dcount by object
                | streamstats count as rank by object
                | where dcount>1
                ``` handle rank if the duplicated is due to FIPS migration ```
                | eval rank=if(len(keyid) == 64, 2, 1)
                | where rank=1
                """
            )
        # run the main report, every result is a Splunk search to be executed on its own thread
        logging.info(
            f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, executing search="{duplicated_entities_search}"'
        )
        try:
            reader = run_splunk_search(
                self.service,
                duplicated_entities_search,
                kwargs_oneshot,
                24,
                5,
            )
            for item in reader:
                if isinstance(item, dict):
                    duplicated_entities_count += 1
                    duplicated_entities_list.append(item.get("keyid"))
                    logging.warning(
                        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, detected duplicated entity, keyid="{item.get("keyid")}", object="{item.get("object")}"'
                    )
            if duplicated_entities_count == 0:
                logging.info(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, there are no duplicated entities currently.'
                )
        except Exception as e:
            # Call the component register to flag the tenant failure
            # NOTE(review): report_name and start come from earlier in this
            # method, outside this excerpt — confirm they are still current here
            trackme_register_tenant_object_summary(
                session_key,
                self._metadata.searchinfo.splunkd_uri,
                self.tenant_id,
                "all",
                report_name,
                "failure",
                time.time(),
                str(time.time() - start),
                str(e),
                "-5m",
                "now",
            )
            msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, main search failed with exception="{str(e)}"'
            logging.error(msg)
            raise Exception(msg)
        # process if needed
        if duplicated_entities_count > 0:
            # target: per-component delete endpoint (branches are mutually exclusive,
            # so the mixed elif/if chain below behaves like a plain dispatch)
            if component == "dsm":
                target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_dsm/write/ds_delete"
            elif component == "dhm":
                target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_dhm/write/dh_delete"
            if component == "mhm":
                target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_mhm/write/mh_delete"
            if component == "flx":
                target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_flx/write/flx_delete"
            if component == "fqm":
                target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_fqm/write/fqm_delete"
            if component == "wlk":
                target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_wlk/write/wlk_delete"
            # data
            # turn duplicated_entities_list into a comma separated string
            # update comment
            if component == "wlk":
                update_comment = "One or more duplicated entities were detected by the health tracker, this condition can happen when Splunk scheduler logs lack the user context, automated purge of these entities."
            else:
                update_comment = "One or more duplicated entities were detected by the health tracker, this condition is not expected and TrackMe needs to purge duplicates to avoid further issues."
            duplicated_entities_list = ",".join(duplicated_entities_list)
            post_data = {
                "tenant_id": self.tenant_id,
                "keys_list": duplicated_entities_list,
                "deletion_type": "temporary",
                "update_comment": update_comment,
            }
            try:
                response = session.post(
                    target_url,
                    data=json.dumps(post_data),
                    verify=False,
                    timeout=600,
                )
                msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, duplicated entities purge successful, results="{json.dumps(response.json(), indent=2)}"'
                logging.info(msg)
            except Exception as e:
                msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, duplicated entities purge failed with exception="{str(e)}"'
                logging.info(msg)
# end task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
)
#
# task: check_trackers_collections
#
# this task is designed to verify that trackers referenced in the dedicated collections are still present in the system
# if not, it will remove the tracker from the collection
task_instance_id = self.get_uuid()
task_name = "check_trackers_collections"
task_start = time.time()
# start task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
def check_trackers_existence(vtenant_record, component):
    """
    Purge orphaned tracker records from the component's dedicated trackers collection.

    For each record of kv_trackme_<component>_hybrid_trackers_tenant_<tenant_id>,
    the main tracker report (the report whose name contains "_tracker_tenant_")
    is looked up in the Splunk saved searches. The record is deleted from the
    collection when the main tracker no longer exists, or when the record does
    not reference any main tracker at all (invalid record).

    :param vtenant_record: virtual tenant KVstore record (unused here, kept for
        signature consistency with the sibling check functions)
    :param component: TrackMe component short name (dsm, dhm, mhm, flx, wlk, fqm)
    """
    logging.info(f"Checking tracker definitions for component: {component}")
    # Load the tracker collection associated with the component (source of truth)
    tracker_collection_name = (
        f"kv_trackme_{component}_hybrid_trackers_tenant_{self.tenant_id}"
    )
    tracker_collection = self.service.kvstore[tracker_collection_name]
    # Get all the tracker records
    tracker_records = tracker_collection.data.query()
    for tracker_record in tracker_records:
        record_knowledge_objects = json.loads(
            tracker_record.get("knowledge_objects", "{}")
        )
        # identify the main tracker: the report containing _tracker_tenant_ in its name
        reports_list = record_knowledge_objects.get("reports", [])
        tracker_main_name = next(
            (report for report in reports_list if "_tracker_tenant_" in report),
            None,
        )
        # Verify the existence of the main tracker, if it cannot be found in the
        # system, the entire record will be removed from the collection
        purge_tracker_record = False
        if tracker_main_name:
            # verify the saved search still exists and carries a search definition
            # (cleanup: the unused savedsearch_content local was removed)
            try:
                savedsearch_definition = self.service.saved_searches[
                    tracker_main_name
                ].content["search"]
            except Exception:
                savedsearch_definition = None
            if not savedsearch_definition:
                purge_tracker_record = True
                logging.warning(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, the main tracker="{tracker_main_name}" does not exist anymore, the tracker record will be removed from the collection.'
                )
        else:
            # no main tracker referenced: the record is invalid and must be purged
            purge_tracker_record = True
            logging.warning(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, the tracker record="{tracker_record}" is invalid, the tracker record will be removed from the collection.'
            )
        # purge if necessary
        if purge_tracker_record:
            try:
                tracker_collection.data.delete(
                    json.dumps({"_key": tracker_record.get("_key")})
                )
                logging.info(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, the tracker record was successfully removed from the collection.'
                )
            except Exception as e:
                logging.error(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, the tracker record failed to be removed from the collection, exception="{str(e)}"'
                )
def recreate_missing_tracker_records(vtenant_record, component):
    """
    Recreate hybrid tracker records in dedicated KVstore if they exist in
    tenant_<component>_hybrid_objects but are missing from the dedicated collection.

    Wrapper reports listed in the tenant hybrid objects are the discovery source:
    each wrapper report name yields a tracker name, and the expected reports
    (abstract/wrapper/tracker) and root constraint macro are matched by exact
    name before a record is inserted into the dedicated collection.

    :param vtenant_record: virtual tenant KVstore record (read-only here)
    :param component: TrackMe component short name (dsm, dhm, mhm, flx, wlk, fqm)
    """
    logging.info(f"Checking for missing tracker records to recreate for component: {component}")
    # Load the tenant hybrid objects from vtenant_record (central source)
    hybrid_objects_json = vtenant_record.get(f"tenant_{component}_hybrid_objects")
    if not hybrid_objects_json:
        logging.info(
            f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, No hybrid objects found in vtenant_record for component "{component}", skipping recreation check.'
        )
        return
    try:
        hybrid_objects = json.loads(hybrid_objects_json)
    except Exception as e:
        logging.warning(
            f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Failed to parse hybrid_objects JSON, exception="{str(e)}"'
        )
        return
    reports_list = hybrid_objects.get("reports", [])
    macros_list = hybrid_objects.get("macros", [])
    if not reports_list:
        logging.info(
            f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, No reports found in hybrid_objects for component "{component}", skipping recreation check.'
        )
        return
    # Load the dedicated tracker collection
    tracker_collection_name = (
        f"kv_trackme_{component}_hybrid_trackers_tenant_{self.tenant_id}"
    )
    tracker_collection = self.service.kvstore[tracker_collection_name]
    # Get existing tracker records from dedicated collection
    existing_tracker_records = tracker_collection.data.query()
    existing_tracker_names = set()
    for record in existing_tracker_records:
        tracker_name = record.get("tracker_name")
        if tracker_name:
            existing_tracker_names.add(tracker_name)
    # Process wrapper reports to extract tracker names
    # Pattern: trackme_<component>_hybrid_<tracker_name>_wrapper_tenant_<tenant_id>
    wrapper_prefix = f"trackme_{component}_hybrid_"
    wrapper_suffix = f"_wrapper_tenant_{self.tenant_id}"
    # Track trackers we've already processed to avoid duplicates
    processed_trackers = {}
    for report_name in reports_list:
        # Only process wrapper reports to identify trackers
        if "_wrapper_" not in report_name:
            continue
        # Extract tracker_name from wrapper report name
        if report_name.startswith(wrapper_prefix) and report_name.endswith(wrapper_suffix):
            # Remove prefix and suffix to get tracker_name
            tracker_name = report_name[len(wrapper_prefix):-len(wrapper_suffix)]
            # Check if this tracker exists in the dedicated collection
            if tracker_name not in existing_tracker_names and tracker_name not in processed_trackers:
                # Collect all reports and macros for this tracker.
                # Expected report names are constructed explicitly for exact matching,
                # which avoids issues with reserved words (abstract, wrapper, tracker)
                # and substring matches:
                # - Components with abstract (dsm, dhm, mhm):
                #   * trackme_<component>_hybrid_abstract_<tracker_name>_tenant_<tenant_id>
                #   * trackme_<component>_hybrid_<tracker_name>_wrapper_tenant_<tenant_id>
                #   * trackme_<component>_hybrid_<tracker_name>_tracker_tenant_<tenant_id>
                # - Components without abstract (flx, wlk, fqm): wrapper and tracker only
                tracker_reports = []
                tracker_macros = []
                expected_reports = []
                # Components with abstract reports: dsm, dhm, mhm
                if component in ["dsm", "dhm", "mhm"]:
                    expected_reports.append(f"trackme_{component}_hybrid_abstract_{tracker_name}_tenant_{self.tenant_id}")
                # All components have wrapper and tracker reports
                expected_reports.append(f"trackme_{component}_hybrid_{tracker_name}_wrapper_tenant_{self.tenant_id}")
                expected_reports.append(f"trackme_{component}_hybrid_{tracker_name}_tracker_tenant_{self.tenant_id}")
                # Match reports using exact names
                for report in reports_list:
                    if report in expected_reports:
                        tracker_reports.append(report)
                # Macros are only applicable to dsm, dhm, mhm components
                # Macro pattern: trackme_<component>_hybrid_root_constraint_<tracker_name>_tenant_<tenant_id>
                if component in ["dsm", "dhm", "mhm"]:
                    expected_macro = f"trackme_{component}_hybrid_root_constraint_{tracker_name}_tenant_{self.tenant_id}"
                    if expected_macro in macros_list:
                        tracker_macros.append(expected_macro)
                # Only proceed if we have at least one report
                if tracker_reports:
                    processed_trackers[tracker_name] = {
                        "reports": tracker_reports,
                        "macros": tracker_macros
                    }
    # Recreate missing tracker records
    for tracker_name, knowledge_data in processed_trackers.items():
        logging.info(
            f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Recreating missing tracker record for "{tracker_name}" in dedicated collection.'
        )
        # Build knowledge_objects structure (without properties as per requirement)
        knowledge_objects = {
            "reports": knowledge_data["reports"]
        }
        # Add macros if present (only for components that use them)
        if knowledge_data["macros"]:
            knowledge_objects["macros"] = knowledge_data["macros"]
        # Create the tracker record, the deterministic _key allows idempotent recreation
        new_tracker_record = {
            "_key": hashlib.sha256(tracker_name.encode("utf-8")).hexdigest(),
            "tracker_name": tracker_name,
            "knowledge_objects": json.dumps(knowledge_objects, indent=2),
            "created_time": time.time(),
            "created_by": "health_tracker"
        }
        # Add component-specific fields
        if component == "wlk":
            # wlk tracker records require the tracker_type field,
            # tracker_name format is: {tracker_type}_{uuid}.
            # Some tracker types contain underscores (e.g. inactive_entities,
            # splunkcloud_svc), so match against the known types by prefix.
            valid_wlk_tracker_types = [
                "main", "introspection", "scheduler", "metadata",
                "orphan", "inactive_entities", "splunkcloud_svc", "notable"
            ]
            extracted_tracker_type = None
            for valid_type in valid_wlk_tracker_types:
                if tracker_name.startswith(valid_type + "_") or tracker_name == valid_type:
                    extracted_tracker_type = valid_type
                    break
            # cleanup: the former "fall back to simple split" branch was dead code,
            # the prefix loop above already matches any tracker_name whose first
            # underscore-delimited segment is a valid single-word type
            if extracted_tracker_type:
                new_tracker_record["tracker_type"] = extracted_tracker_type
            else:
                logging.warning(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Could not extract valid tracker_type from tracker_name="{tracker_name}" (expected format: tracker_type_uuid)'
                )
        elif component in ["flx", "fqm"]:
            # flx and fqm use tracker_id field
            new_tracker_record["tracker_id"] = tracker_name
        try:
            # Final safety check: verify the tracker doesn't exist before insertion
            final_check = tracker_collection.data.query(
                query=json.dumps({"tracker_name": tracker_name})
            )
            if not final_check:
                tracker_collection.data.insert(json.dumps(new_tracker_record))
                logging.info(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Successfully recreated tracker record for "{tracker_name}" in dedicated collection.'
                )
            else:
                logging.info(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Tracker "{tracker_name}" already exists in dedicated collection, skipping recreation.'
                )
        except Exception as e:
            logging.error(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Failed to recreate tracker record for "{tracker_name}", exception: {str(e)}'
            )
# Main logic: run the collection consistency checks for every component
# enabled on this virtual tenant
components = ["dsm", "dhm", "mhm", "flx", "wlk", "fqm"]
for component in components:
    if vtenant_record.get(f"tenant_{component}_enabled"):
        check_trackers_existence(vtenant_record, component)
        recreate_missing_tracker_records(vtenant_record, component)
# end task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
)
#
# task: check_trackers
#
# Reconcile tracker knowledge objects between the dedicated trackers
# collections and the virtual tenant record (see functions below)
task_instance_id = self.get_uuid()
task_name = "check_trackers_definition"
task_start = time.time()
# start task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
def check_trackers_definition(vtenant_record, component):
    """
    Reconcile the tenant hybrid objects with the dedicated trackers collection.

    The per-component trackers collection is the source of truth: every report
    and macro referenced by its records must also be listed in the
    tenant_<component>_hybrid_objects field of the virtual tenant record. Any
    missing report or macro is added back and the virtual tenants collection is
    updated accordingly.

    :param vtenant_record: virtual tenant KVstore record; mutated in place and
        persisted to kv_trackme_virtual_tenants when objects are missing
    :param component: TrackMe component short name (dsm, dhm, mhm, flx, wlk, fqm)
    """
    logging.info(f"Checking tracker definitions for component: {component}")
    # Load the tracker collection associated with the component (source of truth)
    tracker_collection_name = (
        f"kv_trackme_{component}_hybrid_trackers_tenant_{self.tenant_id}"
    )
    tracker_collection = self.service.kvstore[tracker_collection_name]
    # Get all the tracker records
    tracker_records = tracker_collection.data.query()
    # Accumulate the full set of reports and macros that the vtenant_record must reference
    truth_reports = set()
    truth_macros = set()
    for tracker_record in tracker_records:
        record_knowledge_objects = json.loads(
            tracker_record.get("knowledge_objects", "{}")
        )
        # Collect the reports and macros from the tracker record's knowledge_objects
        truth_reports.update(record_knowledge_objects.get("reports", []))
        truth_macros.update(record_knowledge_objects.get("macros", []))
    # Load the current tenant hybrid objects from vtenant_record (destination)
    hybrid_objects_json = vtenant_record.get(
        f"tenant_{component}_hybrid_objects"
    )
    if hybrid_objects_json:
        # Load the JSON object from the hybrid_objects field
        hybrid_objects = json.loads(hybrid_objects_json)
    else:
        # If no existing hybrid_objects, initialize an empty structure
        hybrid_objects = {"reports": [], "macros": []}
    vtenant_reports = set(hybrid_objects.get("reports", []))
    vtenant_macros = set(hybrid_objects.get("macros", []))
    # Compare and find missing reports/macros in the vtenant_record
    missing_reports = truth_reports - vtenant_reports
    missing_macros = truth_macros - vtenant_macros
    # Only update (and persist) the vtenant_record when something is missing
    if missing_reports or missing_macros:
        logging.info(
            f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Missing reports: {missing_reports} or macros: {missing_macros} in vtenant_record.'
        )
        # Union keeps any extra objects already present in the vtenant_record
        hybrid_objects["reports"] = list(vtenant_reports.union(truth_reports))
        hybrid_objects["macros"] = list(vtenant_macros.union(truth_macros))
        # Save the updated hybrid objects back to the vtenant_record
        vtenant_record[f"tenant_{component}_hybrid_objects"] = json.dumps(
            hybrid_objects, indent=2
        )
        try:
            self.service.kvstore["kv_trackme_virtual_tenants"].data.update(
                str(vtenant_record["_key"]), json.dumps(vtenant_record)
            )
            logging.info(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, vtenant_record updated successfully.'
            )
        except Exception as e:
            logging.error(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Failed to update vtenant_record, exception: {str(e)}'
            )
def check_trackers_existence_in_dedicated_kvstore(vtenant_record, component):
    """
    Backfill the dedicated hybrid trackers collection from the central collection.

    Scans kv_trackme_<component>_tenant_<tenant_id> for records carrying a
    tracker_name; for each hybrid tracker not yet present in
    kv_trackme_<component>_hybrid_trackers_tenant_<tenant_id>, a new tracker
    record is created referencing the wrapper and tracker reports (plus the
    root constraint macro for dsm/dhm/mhm).

    :param vtenant_record: virtual tenant KVstore record (unused here, kept for
        signature consistency with the sibling check functions)
    :param component: TrackMe component short name (dsm, dhm, mhm, flx, wlk, fqm)
    """
    logging.info(f"Checking tracker existence in dedicated KVstore for component: {component}")
    # Load the central KVstore collection to get all tracker records
    central_collection_name = f"kv_trackme_{component}_tenant_{self.tenant_id}"
    try:
        central_collection = self.service.kvstore[central_collection_name]
        central_records = central_collection.data.query()
    except Exception as e:
        logging.warning(
            f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Central collection "{central_collection_name}" not found or accessible, exception: {str(e)}'
        )
        return
    # Load the dedicated tracker collection
    tracker_collection_name = (
        f"kv_trackme_{component}_hybrid_trackers_tenant_{self.tenant_id}"
    )
    tracker_collection = self.service.kvstore[tracker_collection_name]
    # Get existing tracker records from dedicated collection
    existing_tracker_records = tracker_collection.data.query()
    existing_tracker_names = set()
    for record in existing_tracker_records:
        tracker_name = record.get("tracker_name")
        if tracker_name:
            existing_tracker_names.add(tracker_name)
    # Track tracker names being processed in this batch to prevent duplicates
    processing_tracker_names = set()
    # Process each central record to find tracker names
    for central_record in central_records:
        tracker_name = central_record.get("tracker_name")
        if not tracker_name:
            continue
        # Skip JSON-array tracker names (concurrent tracker format): these are
        # normalized tracker names rather than full report names and do not
        # need hybrid tracker records
        try:
            if isinstance(tracker_name, str):
                parsed_tracker_name = json.loads(tracker_name)
                if isinstance(parsed_tracker_name, list):
                    continue
        except (json.JSONDecodeError, TypeError):
            # Not a JSON array, continue processing as a string
            pass
        # Extract the base tracker name by removing _wrapper_tenant_ or _tracker_tenant_ suffix
        base_tracker_name = None
        if "_wrapper_tenant_" in tracker_name:
            base_tracker_name = tracker_name.split("_wrapper_tenant_")[0]
        elif "_tracker_tenant_" in tracker_name:
            base_tracker_name = tracker_name.split("_tracker_tenant_")[0]
        if not base_tracker_name:
            continue
        # Remove the trackme_<component>_hybrid_ prefix to get the actual tracker name
        expected_prefix = f"trackme_{component}_hybrid_"
        if base_tracker_name.startswith(expected_prefix):
            actual_tracker_name = base_tracker_name.replace(expected_prefix, "", 1)
        else:
            actual_tracker_name = base_tracker_name
        # Check if this tracker exists in the dedicated collection (by name or ID)
        # Also check if we're already processing this tracker name in this batch
        if (actual_tracker_name not in existing_tracker_names and
                actual_tracker_name not in processing_tracker_names):
            # Add to processing set to prevent duplicates in this batch
            processing_tracker_names.add(actual_tracker_name)
            logging.info(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Tracker "{actual_tracker_name}" not found in dedicated collection, creating record.'
            )
            # Create a new tracker record in the dedicated collection with both
            # wrapper and tracker reports.
            # (cleanup: removed a dead "reports_list = []" assignment that was
            # immediately overwritten)
            # NOTE(review): when the central record holds the wrapper report name,
            # the replace below is a no-op and the list ends up with the wrapper
            # twice and no tracker report — confirm central records always carry
            # the _tracker_tenant_ form.
            wrapper_name = tracker_name.replace("_tracker_tenant_", "_wrapper_tenant_")
            reports_list = [wrapper_name, tracker_name]
            # Build knowledge_objects structure
            knowledge_objects = {
                "reports": reports_list
            }
            # Macros are only applicable to dsm, dhm, mhm components
            if component in ["dsm", "dhm", "mhm"]:
                # Extract the tracker identifier from the base tracker name
                # Example: trackme_dsm_hybrid_tracker-iew8hkxv -> tracker-iew8hkxv
                if "_hybrid_" in base_tracker_name:
                    tracker_identifier = base_tracker_name.split("_hybrid_")[1]
                    macro_name = f"trackme_{component}_hybrid_root_constraint_{tracker_identifier}_tenant_{self.tenant_id}"
                    knowledge_objects["macros"] = [macro_name]
            new_tracker_record = {
                "tracker_name": actual_tracker_name,
                "tracker_id": actual_tracker_name,  # tracker_id should equal tracker_name
                "knowledge_objects": json.dumps(knowledge_objects, indent=2),
                "created_time": time.time(),
                "created_by": "health_tracker"
            }
            try:
                # Final safety check: verify the tracker doesn't exist before insertion
                final_check = tracker_collection.data.query(query=json.dumps({"tracker_name": actual_tracker_name}))
                if not final_check:
                    tracker_collection.data.insert(json.dumps(new_tracker_record))
                    logging.info(
                        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Successfully created tracker record for "{actual_tracker_name}" in dedicated collection.'
                    )
                else:
                    logging.info(
                        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Tracker "{actual_tracker_name}" already exists in dedicated collection, skipping creation.'
                    )
            except Exception as e:
                logging.error(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Failed to create tracker record for "{actual_tracker_name}", exception: {str(e)}'
                )
# Main logic: reconcile tracker knowledge objects for every enabled component
components = ["dsm", "dhm", "mhm", "flx", "wlk", "fqm"]
for component in components:
    if vtenant_record.get(f"tenant_{component}_enabled"):
        check_trackers_definition(vtenant_record, component)
        check_trackers_existence_in_dedicated_kvstore(vtenant_record, component)
# end task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
)
#
# task: check_alerts_definition
#
task_instance_id = self.get_uuid()
task_name = "check_alerts_definition"
task_start = time.time()
#
# Verify for each tenant record the content of tenant_alert_objects
# - load the tenant_alert_objects object
# - For each alert, verify that the alert exists in the system
# - if not, remove the alert from the tenant_alert_objects object and update the record
#
def check_alerts_definition(alert_name):
    """
    Return True if the alert exists as a saved search, False otherwise.

    Any lookup failure (missing saved search, access error) is treated as a
    missing alert.

    :param alert_name: name of the alert saved search to verify
    """
    try:
        # accessing the saved search raises when it does not exist; the
        # search content itself is not needed (cleanup: removed unused locals)
        self.service.saved_searches[alert_name].content.get("search")
        return True
    except Exception:
        return False
# start task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
# Load the tenant_alert_objects object (stored as a JSON string in the tenant record)
tenant_alert_objects = vtenant_record.get("tenant_alert_objects", {})
if tenant_alert_objects:
    try:
        tenant_alert_objects = json.loads(tenant_alert_objects)
    except Exception as e:
        logging.error(
            f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, Failed to load tenant_alert_objects, exception: {str(e)}'
        )
        tenant_alert_objects = {}
    # alerts is a list stored in "alerts" key
    alerts = tenant_alert_objects.get("alerts", [])
    # verify each alert, building the list of alerts that still exist
    # (bugfix: the previous implementation removed items from the list while
    # iterating over it, which skips the element following each removal)
    remaining_alerts = []
    for alert_name in alerts:
        if check_alerts_definition(alert_name):
            remaining_alerts.append(alert_name)
        else:
            logging.warning(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, alert="{alert_name}" not found in saved searches, will be removed from tenant_alert_objects'
            )
    alerts_were_removed = len(remaining_alerts) != len(alerts)
    # save the updated tenant_alert_objects only when something was removed
    if alerts_were_removed:
        tenant_alert_objects["alerts"] = remaining_alerts
        vtenant_record["tenant_alert_objects"] = json.dumps(
            tenant_alert_objects, indent=2
        )
        try:
            self.service.kvstore["kv_trackme_virtual_tenants"].data.update(
                str(vtenant_record["_key"]), json.dumps(vtenant_record)
            )
            logging.info(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, vtenant_record updated successfully.'
            )
        except Exception as e:
            logging.error(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, Failed to update vtenant_record, exception: {str(e)}'
            )
# end task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
)
#
# task: logical_groups
#
task_instance_id = self.get_uuid()
task_name = "check_logical_groups"
task_start = time.time()
#
# Verify Logical Groups:
# - load the logical groups KVstore collection
# - verify that each member of the groups can be found in any of the dsm/dhm/mhm/flx/fqm KVstore collections as an actively monitored entity
# - if not, purge the member from the group
#
def query_kvstore_for_object(member, collection_suffix):
    """
    Return True when the member exists as an actively monitored entity in the
    component KVstore collection, False otherwise.

    :param member: the entity object value to look up
    :param collection_suffix: component short name (dsm, dhm, mhm, flx, fqm)
        used to build the collection name
    """
    target_collection_name = (
        f"kv_trackme_{collection_suffix}_tenant_{self.tenant_id}"
    )
    target_collection = self.service.kvstore[target_collection_name]
    # match entities that are this object AND actively monitored
    query_string = {
        "$and": [
            {
                "object": member,
                "monitored_state": "enabled",
            }
        ]
    }
    # no match (IndexError) or query failure both mean "not found"
    # (bugfix: narrowed the former bare except, which would also swallow
    # SystemExit/KeyboardInterrupt)
    try:
        kvrecord = target_collection.data.query(query=json.dumps(query_string))[
            0
        ]
        kvrecord_key = kvrecord.get("_key", None)
    except Exception:
        kvrecord_key = None
    if kvrecord_key:
        logging.debug(
            f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, member="{member}", found in KVstore collection="{target_collection_name}"'
        )
        return True
    return False
# Run the logical groups verification only when at least one entity-bearing
# component is enabled for this tenant
if (
    vtenant_record.get("tenant_dsm_enabled") == True
    or vtenant_record.get("tenant_dhm_enabled") == True
    or vtenant_record.get("tenant_mhm_enabled") == True
    or vtenant_record.get("tenant_flx_enabled") == True
    or vtenant_record.get("tenant_fqm_enabled") == True
):
    # log start
    logging.info(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting to verify logical groups, any orphan logical group member will be purged automatically.'
    )
    # time counter
    logical_group_check_start = time.time()
    #
    # Logical groups collection records
    #
    logical_group_coll = self.service.kvstore[
        f"kv_trackme_common_logical_group_tenant_{self.tenant_id}"
    ]
    # retrieve the raw records plus the pre-computed lookup structures
    (
        logical_groups_coll_records,
        logical_groups_by_group_key_dict,
        logical_groups_by_group_name_list,
        logical_groups_by_member_dict,
        logical_groups_by_member_list,
    ) = get_logical_groups_collection_records(logical_group_coll)
    # log all returned from the function
    logging.debug(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, logical_groups_coll_records={json.dumps(logical_groups_coll_records, indent=2)}, logical_groups_by_group_key_dict={json.dumps(logical_groups_by_group_key_dict, indent=2)}, logical_groups_by_group_name_list={json.dumps(logical_groups_by_group_name_list, indent=2)}, logical_groups_by_member_dict={json.dumps(logical_groups_by_member_dict, indent=2)}, logical_groups_by_member_list={json.dumps(logical_groups_by_member_list, indent=2)}'
    )
    # loops through logical_groups_by_member_list if not empty, then check in each KVstore collection if we have a match
    # NOTE(review): logical_member_found is never reset per member; it is
    # reassigned on every member because at least one component is enabled
    # (outer condition) — keep in mind if the loop structure changes.
    logical_member_found = False
    logical_members_orphans = []
    # ensure logical_groups_by_member_list is a list
    if isinstance(logical_groups_by_member_list, str):
        logical_groups_by_member_list = [logical_groups_by_member_list]
    if len(logical_groups_by_member_list) > 0:
        #
        # Orphans: members not found as active entities in any enabled component
        #
        for member in logical_groups_by_member_list:
            for tenant_setting, collection_suffix in [
                ("tenant_dsm_enabled", "dsm"),
                ("tenant_dhm_enabled", "dhm"),
                ("tenant_mhm_enabled", "mhm"),
                ("tenant_flx_enabled", "flx"),
                ("tenant_fqm_enabled", "fqm"),
            ]:
                if vtenant_record.get(tenant_setting) == True:
                    logical_member_found = query_kvstore_for_object(
                        member, collection_suffix
                    )
                    if logical_member_found:
                        break
            if not logical_member_found:
                logical_members_orphans.append(member)
        # log orphans
        logging.debug(
            f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, logical_members_orphans={json.dumps(logical_members_orphans, indent=2)}'
        )
        # purge orphans
        if len(logical_members_orphans) > 0:
            # the REST endpoint expects a comma separated string of members
            logical_members_orphans = ",".join(logical_members_orphans)
            try:
                logical_group_purge_remove_response = (
                    logical_group_remove_object_from_groups(
                        self._metadata.searchinfo.splunkd_uri,
                        self._metadata.searchinfo.session_key,
                        self.tenant_id,
                        logical_members_orphans,
                    )
                )
                logging.info(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, orphan_members="{logical_members_orphans}", successfully purged the logical groups collection, response="{json.dumps(logical_group_purge_remove_response, indent=2)}"'
                )
            except Exception as e:
                logging.error(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, orphan_members="{logical_members_orphans}", failed to purge from the logical groups collection, exception={str(e)}'
                )
    #
    # empty groups: purge any group left without members
    # NOTE(review): reconstructed placement — assumed this runs whether or not
    # the member list was empty; confirm against the original indentation.
    #
    for logical_group_record in logical_groups_coll_records:
        # get the group name
        object_group_name = logical_group_record.get("object_group_name")
        # get the members, normalizing an empty collection to None
        members = logical_group_record.get("object_group_members", None)
        if members:
            if not len(members) > 0:
                members = None
        if not members:
            logging.info(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, group="{object_group_name}", group has no members, will be purged.'
            )
            try:
                logical_group_delete_response = (
                    logical_group_delete_group_by_name(
                        self._metadata.searchinfo.splunkd_uri,
                        self._metadata.searchinfo.session_key,
                        self.tenant_id,
                        object_group_name,
                    )
                )
                logging.info(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, group="{object_group_name}", group has been purged successfully, response="{json.dumps(logical_group_delete_response, indent=2)}"'
                )
            except Exception as e:
                logging.error(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, group="{object_group_name}", failed to purge the group, exception={str(e)}'
                )
    # log time
    logging.info(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, logical_groups_check_duration="{round(time.time() - logical_group_check_start, 3)}"'
    )
# end task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
)
#
# task: check_trackers
#
task_instance_id = self.get_uuid()
task_name = "check_trackers_statuses"
task_start = time.time()
# start task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
# Set the query: retrieve the per-tracker ops status for the tenant and expand
# it into one result per report
health_search = remove_leading_spaces(
    f"""
| trackme mode=post url=/services/trackme/v2/configuration/get_tenant_ops_status body=\"{{'mode': 'raw', 'tenant_id': '{self.tenant_id}'}}\"
| trackmeopsstatusexpand
"""
)
# logging
# NOTE(review): report_name here is a variable set earlier in this method (not
# within this task) — presumably the last report processed; confirm it is
# intended in this log message.
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id="{self.tenant_id}", Starting health_search, report="{report_name}", search="{health_search}"'
)
# kwargs for the oneshot search execution
kwargs_oneshot = {
    "earliest_time": "-5m",
    "latest_time": "now",
    "output_mode": "json",
    "count": 0,  # 0 means no limit on the number of returned results
}
# run the main report, every result is a Splunk search to be executed on its own thread
try:
reader = run_splunk_search(
self.service,
health_search,
kwargs_oneshot,
24,
5,
)
# Call the component register
trackme_register_tenant_object_summary(
session_key,
self._metadata.searchinfo.splunkd_uri,
self.tenant_id,
"all",
report_name,
"success",
time.time(),
str(time.time() - start),
"The report was executed successfully",
"-5m",
"now",
)
for item in reader:
if isinstance(item, dict):
# verify the knowledge object - if for some reason it is not existing anymore, we should remove it
# and not take it into account any longer
# process
savedsearch_definition = None
report_name = item.get("report")
try:
savedsearch = self.service.saved_searches[report_name]
savedsearch_definition = savedsearch.content["search"]
savedsearch_content = savedsearch.content
except Exception as e:
savedsearch_definition = None
savedsearch_content = {}
logging.warning(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, the report="{report_name}" does not exist anymore, somehow it was removed without TrackMe being aware of it, will get rid of this now.'
)
if not savedsearch_definition:
# extract component
component = report_name.split("_")[1]
# purge
try:
delete_register_summary = (
trackme_delete_tenant_object_summary(
self._metadata.searchinfo.session_key,
self._metadata.searchinfo.splunkd_uri,
self.tenant_id,
f"splk-{component}",
report_name,
)
)
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, knowledge for the report="{report_name}" was purged successfully, response="{delete_register_summary}"'
)
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, exception encountered while calling function trackme_delete_tenant_object_summary, exception="{str(e)}"'
)
else:
search_component = item.get("component")
search_cron_schedule = savedsearch_content.get("cron_schedule")
search_description = savedsearch_content.get("description")
search_earliest = savedsearch_content.get(
"dispatch.earliest_time"
)
search_last_duration = item.get("last_duration")
search_last_exec = item.get("last_exec")
search_last_result = item.get("last_result")
search_last_status = item.get("last_status")
search_latest = savedsearch_content.get("dispatch.latest_time")
search_report_name = report_name
search_schedule_window = savedsearch_content.get(
"schedule_window"
)
search_tenant_id = item.get("tenant_id")
search_workload_pool = savedsearch_content.get(
"workload_pool", None
)
# ACLs
acl_report_info = None
if self.get_acl:
# try to get acl
acl_link = savedsearch.links["alternate"]
acl_report_info = {}
acl_url = f"{self._metadata.searchinfo.splunkd_uri}{acl_link}/acl/list?output_mode=json"
try:
response = session.get(
acl_url,
verify=False,
timeout=600,
)
response_json = response.json()
response.raise_for_status()
acl_properties = response_json["entry"][0].get(
"acl", {}
)
acl_report_info = {
"eai:acl.owner": acl_properties.get("owner"),
"eai:acl.perms.read": acl_properties["perms"][
"read"
],
"eai:acl.perms.write": acl_properties["perms"][
"write"
],
"eai:acl.sharing": acl_properties.get("sharing"),
}
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, exception encountered while trying to get the ACL for the report="{report_name}", exception="{str(e)}"'
)
# set info record
search_info_record = {
"component": search_component,
"cron_schedule": search_cron_schedule,
"description": search_description,
"earliest": search_earliest,
"last_duration": search_last_duration,
"last_exec": search_last_exec,
"last_result": search_last_result,
"last_status": search_last_status,
"latest": search_latest,
"report": search_report_name,
"schedule_window": search_schedule_window,
"tenant_id": search_tenant_id,
}
# most often the workload pool is not set, only add if explicitly set
if search_workload_pool:
search_info_record["workload_pool"] = search_workload_pool
# add acl info
if acl_report_info:
search_info_record.update(acl_report_info)
yield {
"_time": time.time(),
"_raw": search_info_record,
"component": search_component,
"cron_schedule": search_cron_schedule,
"description": search_description,
"earliest": search_earliest,
"last_duration": search_last_duration,
"last_exec": search_last_exec,
"last_result": search_last_result,
"last_status": search_last_status,
"latest": search_latest,
"report": search_report_name,
"schedule_window": search_schedule_window,
"tenant_id": search_tenant_id,
"workload_pool": search_workload_pool,
}
# index the audit record
try:
trackme_state_event(
session_key=self._metadata.searchinfo.session_key,
splunkd_uri=self._metadata.searchinfo.splunkd_uri,
tenant_id=self.tenant_id,
index=tenant_indexes["trackme_audit_idx"],
sourcetype="trackme:health",
source=f"trackme:health:{self.tenant_id}",
record=search_info_record,
)
except Exception as e:
error_msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, exception encountered while calling function trackme_state_event, exception="{str(e)}"'
logging.error(error_msg)
raise Exception(error_msg)
except Exception as e:
# Call the component register
trackme_register_tenant_object_summary(
session_key,
self._metadata.searchinfo.splunkd_uri,
self.tenant_id,
"all",
report_name,
"failure",
time.time(),
str(time.time() - start),
str(e),
"-5m",
"now",
)
msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, main search failed with exception="{str(e)}"'
logging.error(msg)
raise Exception(msg)
# end task
logging.info(
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
)
#
# task: check_tenant_record_knowledge_objects
#
task_instance_id = self.get_uuid()
task_name = "check_tenant_record_knowledge_objects"
task_start = time.time()
# start task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
# logic:
# For each component, read the field tenant_<component>_hybrid_objects from the
# vtenant record, load it as JSON and extract the lists of reports and macros.
# For each object, verify it still exists in Splunk; if not, remove it from the
# vtenant record and persist the updated record to the KVstore collection.
for component in ["dsm", "dhm", "mhm", "flx", "wlk", "fqm"]:
    # if the component is disabled, skip (the flag may be stored as a string)
    try:
        component_enablement = int(
            vtenant_record.get(f"tenant_{component}_enabled", 0)
        )
    except Exception:
        component_enablement = 0
    if component_enablement == 0:
        continue
    # get and parse the hybrid_objects field
    hybrid_objects = vtenant_record.get(f"tenant_{component}_hybrid_objects")
    try:
        hybrid_objects = json.loads(hybrid_objects)
    except Exception:
        hybrid_objects = {}
    # if the field does not exist or is empty, skip
    if not hybrid_objects:
        continue
    # "or []" guards against an explicit null value in the JSON document
    reports = hybrid_objects.get("reports") or []
    macros = hybrid_objects.get("macros") or []
    # check reports
    # iterate over a copy: the list is mutated (remove) inside the loop, and
    # removing from the list being iterated silently skips the next element
    for report_name in list(reports):
        # verify the savedsearch still exists and carries a definition
        try:
            savedsearch_definition = self.service.saved_searches[
                report_name
            ].content["search"]
        except Exception:
            savedsearch_definition = None
        # purge if necessary
        if not savedsearch_definition:
            logging.warning(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, the report="{report_name}" does not exist anymore, somehow it was removed without TrackMe being aware of it, will get rid of this now.'
            )
            # remove from the list in hybrid_objects, update the vtenant
            # record and persist it to the KVstore collection
            reports.remove(report_name)
            hybrid_objects["reports"] = reports
            vtenant_record[f"tenant_{component}_hybrid_objects"] = json.dumps(
                hybrid_objects, indent=2
            )
            try:
                self.service.kvstore["kv_trackme_virtual_tenants"].data.update(
                    str(vtenant_record["_key"]), json.dumps(vtenant_record)
                )
                logging.info(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, vtenant_record updated successfully.'
                )
            except Exception as e:
                logging.error(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, Failed to update vtenant_record, exception: {str(e)}'
                )
    # check macros (same copy-iteration rationale as for reports above)
    for macro_name in list(macros):
        # verify the macro still exists and carries a definition
        try:
            macro_definition = self.service.confs["macros"][
                macro_name
            ].content["definition"]
        except Exception:
            macro_definition = None
        # purge if necessary
        if not macro_definition:
            logging.warning(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, the macro="{macro_name}" does not exist anymore, somehow it was removed without TrackMe being aware of it, will get rid of this now.'
            )
            # remove from the list in hybrid_objects, update the vtenant
            # record and persist it to the KVstore collection
            macros.remove(macro_name)
            hybrid_objects["macros"] = macros
            vtenant_record[f"tenant_{component}_hybrid_objects"] = json.dumps(
                hybrid_objects, indent=2
            )
            try:
                self.service.kvstore["kv_trackme_virtual_tenants"].data.update(
                    str(vtenant_record["_key"]), json.dumps(vtenant_record)
                )
                logging.info(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, vtenant_record updated successfully.'
                )
            except Exception as e:
                logging.error(
                    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, Failed to update vtenant_record, exception: {str(e)}'
                )
# end task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
)
#
# task: gen_sla_breaches_events
#
# Task bookkeeping: a fresh instance id, the task name and the start time are
# referenced by every subsequent log statement of this task.
task_instance_id = self.get_uuid()
task_name = "gen_sla_breaches_events"
task_start = time.time()
# announce the task start
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
# Read the SLA breach events generation frequency from the TrackMe
# configuration; fall back to 86400 seconds (24 hours) when the value is
# missing or cannot be converted to an integer.
try:
    sla_conf = reqinfo["trackme_conf"]["sla"]
    sla_breaches_events_frequency = int(sla_conf["sla_breaches_events_frequency"])
except Exception:
    sla_breaches_events_frequency = 86400
def process_sla_breaches_component(component, sla_breaches_events_frequency):
    """Generate SLA breach events for a single TrackMe component.

    Queries the component's tenant collection for enabled entities with
    sla_is_breached=1 and, for each breached entity, indexes a summary
    event at most once per ``sla_breaches_events_frequency`` seconds. The
    last-notification time per entity (keyed by the entity _key) is kept
    in the kv_trackme_<component>_sla_notifications_tenant_<tenant_id>
    KVstore collection.

    :param component: component short name (e.g. "dsm", "flx")
    :param sla_breaches_events_frequency: minimal delay in seconds between
        two events for the same entity
    """
    logging.info(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, processing SLA breaches.'
    )
    # Get the KVstore collection for SLA notifications
    collection_name = (
        f"kv_trackme_{component}_sla_notifications_tenant_{self.tenant_id}"
    )
    collection = self.service.kvstore[collection_name]
    # Run the search to get objects with SLA breaches
    search_string = f'| trackmegetcoll tenant_id="{self.tenant_id}" component="{component}" | where monitored_state="enabled" | table alias object object_category object_state priority keyid sla_* anomaly_reason status_message | where sla_is_breached=1'
    # kwargs
    kwargs_search = {
        "earliest_time": "-5m",
        "latest_time": "now",
        "preview": "false",
        "output_mode": "json",
        "count": 0,
    }
    try:
        search_results = run_splunk_search(
            self.service,
            search_string,
            kwargs_search,
            24,
            5,
        )
        for item in search_results:
            if isinstance(item, dict):
                try:
                    # Extract required fields
                    alias = item.get("alias")
                    object_value = item.get("object")
                    object_category = item.get("object_category")
                    object_state = item.get("object_state")
                    priority = item.get("priority")
                    keyid = item.get("keyid")
                    anomaly_reason = item.get("anomaly_reason")
                    status_message = item.get("status_message")
                    sla_class = item.get("sla_class")
                    sla_is_breached = item.get("sla_is_breached")
                    sla_message = item.get("sla_message")
                    sla_threshold = item.get("sla_threshold")
                    sla_threshold_duration = item.get("sla_threshold_duration")
                    sla_timer = item.get("sla_timer")
                    sla_timer_duration = item.get("sla_timer_duration")
                    # Check if we have a notification record for this object
                    query_string = {"_key": keyid}
                    try:
                        kvrecord = collection.data.query(
                            query=json.dumps(query_string)
                        )[0]
                        last_notification_time = float(kvrecord.get("mtime", 0))
                        current_time = time.time()
                        # Only generate an event if the last notification is
                        # older than the configured frequency window
                        if (
                            current_time - last_notification_time
                            > sla_breaches_events_frequency
                        ):
                            should_generate_event = True
                        else:
                            should_generate_event = False
                    except Exception:
                        # No record exists, we should generate an event
                        should_generate_event = True
                        last_notification_time = 0
                    if should_generate_event:
                        # Create the SLA breach event record
                        # NOTE: timeStr is rendered in the server's local timezone
                        breach_record = {
                            "timeStr": time.strftime(
                                "%d/%m/%Y %H:%M:%S", time.localtime(time.time())
                            ),
                            "tenant_id": self.tenant_id,
                            "alias": alias,
                            "object": decode_unicode(object_value),
                            "keyid": keyid,
                            "object_category": object_category,
                            "object_state": object_state,
                            "priority": priority,
                            "anomaly_reason": anomaly_reason,
                            "status_message": status_message,
                            "sla_class": sla_class,
                            "sla_is_breached": sla_is_breached,
                            "sla_message": sla_message,
                            "sla_threshold": sla_threshold,
                            "sla_threshold_duration": sla_threshold_duration,
                            "sla_timer": sla_timer,
                            "sla_timer_duration": sla_timer_duration,
                        }
                        # Add event_id: a sha256 digest of the serialized record
                        breach_record["event_id"] = hashlib.sha256(
                            json.dumps(breach_record).encode()
                        ).hexdigest()
                        # Generate (index) the event
                        try:
                            trackme_gen_state(
                                index=tenant_indexes["trackme_summary_idx"],
                                sourcetype="trackme:sla_breaches",
                                source=f"health_tracker:{task_name}",
                                event=breach_record,
                            )
                            logging.info(
                                f'TrackMe SLA breach event created successfully, tenant_id="{self.tenant_id}", sla_gen_events_frequency="{sla_breaches_events_frequency}", record="{json.dumps(breach_record, indent=1)}"'
                            )
                            # Update or create the notification record
                            notification_record = {
                                "_key": keyid,
                                "mtime": time.time(),
                                "last_notification": breach_record,
                            }
                            # update first; fall back to insert when the
                            # record does not exist yet
                            try:
                                collection.data.update(
                                    keyid, json.dumps(notification_record)
                                )
                            except Exception:
                                collection.data.insert(
                                    json.dumps(notification_record)
                                )
                        except Exception as e:
                            logging.error(
                                f'tenant_id="{self.tenant_id}", object="{object_value}", failed to generate a SLA breach event with exception="{e}"'
                            )
                except Exception as e:
                    logging.error(
                        f'tenant_id="{self.tenant_id}", failed to process record with exception="{e}"'
                    )
    except Exception as e:
        logging.error(
            f'tenant_id="{self.tenant_id}", component="splk-{component}", failed to run SLA breaches search with exception="{e}"'
        )
# Main logic: generate SLA breach events for every enabled component; the
# feature is disabled entirely when the frequency is not strictly positive.
if sla_breaches_events_frequency > 0:
    for component in ["dsm", "dhm", "mhm", "flx", "wlk", "fqm"]:
        # Coerce the enablement flag to int before testing it: the flag may
        # be stored as a string (the sibling task
        # check_tenant_record_knowledge_objects coerces it the same way),
        # and a plain truthiness test would treat the string "0" as enabled.
        try:
            component_enabled = int(
                vtenant_record.get(f"tenant_{component}_enabled", 0)
            )
        except Exception:
            component_enabled = 0
        if component_enabled:
            process_sla_breaches_component(
                component, sla_breaches_events_frequency
            )
# end task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
)
#
# task: unclosed_stateful_incidents
#
task_instance_id = self.get_uuid()
task_name = "unclosed_stateful_incidents"
task_start = time.time()
# announce the task start
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
# Objective: review every opened or updated incident in the KVstore and verify:
# - the entity the incident refers to still exists with
#   monitored_state="enabled"; if not, the incident is updated and closed;
# - if the entity exists but is in a non alerting state (green, blue) and the
#   incident is older than 24 hours, the incident is updated and closed.
# Resolve the stateful incidents KVstore collection for this tenant.
stateful_incidents_collection_name = (
    f"kv_trackme_stateful_alerting_tenant_{self.tenant_id}"
)
stateful_incidents_collection = self.service.kvstore[
    stateful_incidents_collection_name
]
def get_stateful_incidents(collection_name, collection):
    """Return the opened/updated stateful incidents from a KVstore collection.

    Pages through the collection (a page step of 500 records is assumed,
    matching the KVstore default query limit — TODO confirm against the
    collection configuration) and retains only records whose alert_status
    is "opened" or "updated".

    :param collection_name: collection name (kept for the caller's interface,
        not used for the lookup itself)
    :param collection: splunklib KVstore collection handle
    :return: tuple (records list, set of _key values seen, dict of records
        indexed by their "object" field)
    :raises Exception: re-raised from any KVstore query failure

    Bug fix: the previous implementation tested membership on _key but
    recorded the "object" field in the dedup set, so duplicate records were
    never actually filtered; the set now holds _key values consistently.
    """
    collection_records = []
    collection_records_keys = set()
    collection_dict = {}
    try:
        skip_tracker = 0
        while True:
            page = collection.data.query(skip=skip_tracker)
            if not page:
                break
            for item in page:
                # de-duplicate across pages on the record _key
                if item.get("_key") not in collection_records_keys:
                    if item.get("alert_status") in ("opened", "updated"):
                        collection_records.append(item)
                        collection_records_keys.add(item.get("_key"))
                        collection_dict[item.get("object")] = item
            skip_tracker += 500
        return collection_records, collection_records_keys, collection_dict
    except Exception as e:
        raise Exception(str(e))
# get the stateful incidents; on failure log and fall back to empty results
# so the remainder of the command still runs
try:
    (
        stateful_incidents_records,
        stateful_incidents_keys,
        stateful_incidents_dict,
    ) = get_stateful_incidents(
        stateful_incidents_collection_name, stateful_incidents_collection
    )
except Exception as e:
    logging.error(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to call get_kv_collection, args={stateful_incidents_collection_name}, cannot process this task, exception="{str(e)}"'
    )
    stateful_incidents_records = []
    stateful_incidents_keys = set()
    stateful_incidents_dict = {}
# iterate through opened or updated incidents
for stateful_incident in stateful_incidents_records:
    logging.info(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, inspecting stateful with _key="{stateful_incident.get("_key")}", incident="{stateful_incident}"'
    )
    # get the object
    stateful_object = stateful_incident.get("object")
    # get the object_id
    stateful_object_id = stateful_incident.get("object_id")
    # get the object_category (ex: splk-dsm)
    stateful_object_category = stateful_incident.get("object_category")
    # get the object_state
    # NOTE(review): stateful_object_state is collected but never used below;
    # the close path assigns stateful_object_status instead — confirm intent.
    stateful_object_state = stateful_incident.get("object_state")
    # get the object status
    stateful_object_status = stateful_incident.get("object_status")
    # get the mtime
    stateful_incident_mtime = float(stateful_incident.get("mtime"))
    # calculate the incident duration (seconds since last modification)
    stateful_incident_duration = time.time() - stateful_incident_mtime
    # access the data KVstore collection for the incident's component
    # (e.g. "splk-dsm" -> kv_trackme_dsm_tenant_<tenant_id>)
    object_category_suffix = stateful_object_category.split("-")[1]
    data_collection_name = (
        f"kv_trackme_{object_category_suffix}_tenant_{self.tenant_id}"
    )
    data_collection = self.service.kvstore[data_collection_name]
    # get the object from the data collection; None if not found
    try:
        data_object = data_collection.data.query(
            query=json.dumps({"_key": stateful_object_id})
        )[0]
    except Exception as e:
        data_object = None
    # use-case 1: the object does not exist anymore
    stateful_object_exists = True
    if not data_object:
        stateful_object_exists = False
    # use-case 2: the object exists, but is in a non alerting state while the incident has not been closed 24 hours later
    stateful_incident_outdated = False
    if stateful_object_exists:
        if data_object.get("object_state", "green") in ["green", "blue"]:
            if stateful_incident_duration > 86400:
                stateful_incident_outdated = True
        elif data_object.get("monitored_state") != "enabled":
            # the entity is no longer monitored: close regardless of age
            stateful_incident_outdated = True
    # Update the incident if necessary
    if not stateful_object_exists or stateful_incident_outdated:
        logging.info(
            f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, update of outdated stateful incident is required, stateful_object_exists="{stateful_object_exists}", stateful_incident_outdated="{stateful_incident_outdated}", incident="{stateful_incident}"'
        )
        # update the incident: mark closed and refresh mtime
        stateful_incident["alert_status"] = "closed"
        stateful_incident["mtime"] = time.time()
        if stateful_object_exists:
            # NOTE(review): object_state is overwritten with the *status*
            # value read above — confirm this field mapping is intended.
            stateful_incident["object_state"] = stateful_object_status
        # update the incident in the KVstore
        try:
            stateful_incidents_collection.data.update(
                stateful_incident.get("_key"), json.dumps(stateful_incident)
            )
        except Exception as e:
            logging.error(
                f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to update stateful incident with exception="{e}"'
            )
    else:
        logging.info(
            f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, no action required against incident with _key="{stateful_incident.get("_key")}", stateful_object_exists="{stateful_object_exists}", stateful_incident_outdated="{stateful_incident_outdated}"'
        )
# end task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
)
#
# task: apply_licensing_restrictions
#
task_instance_id = self.get_uuid()
task_name = "apply_licensing_restrictions"
task_start = time.time()
# start task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
#
# licensing restriction
#
# if the component is a restricted component and the product is not registered, it should be disabled now
# NOTE(review): these branches compare the raw vtenant flags against the int 1,
# while other tasks in this file coerce the same flags with int() first —
# confirm the stored type; a string "1" would never match here.
if license_is_valid == 0 and (
    vtenant_record.get("tenant_flx_enabled") == 1
    or vtenant_record.get("tenant_fqm_enabled") == 1
    or vtenant_record.get("tenant_wlk_enabled") == 1
):
    logging.info(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, due to licensing restrictions, this tenant will be automatically disabled, the tenant is running a restricted component while this instance is not registered'
    )
    # target
    target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/vtenants/admin/disable_tenant"
    # data
    post_data = {
        "tenant_id": self.tenant_id,
        "update_comment": "Auto disabling this tenant due to licensing limitation, the tenant is running a restricted component while the product is not currently registered",
        "force": "true",
    }
    try:
        response = session.post(
            target_url,
            data=json.dumps(post_data),
            verify=False,
            timeout=600,
        )
        # NOTE: returning here ends the command immediately; no further
        # processing happens for this tenant after it has been disabled.
        return json.loads(response.text)
    except Exception as e:
        raise Exception(
            f'An exception was encountered while attempting to disable the tenant due to licensing restrictions, exception="{str(e)}"'
        )
# unregistered instance: only the first 2 active tenants are allowed
elif (
    license_is_valid == 0
    and license_active_tenants > 2
    and self.tenant_id not in license_active_tenants_list[0:2]
):
    # NOTE(review): this log message mentions a restricted component, but this
    # branch actually fires on the max-tenants limit (copy/paste artifact?).
    logging.info(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, due to licensing restrictions, this tenant will be automatically disabled, the tenant is running a restricted component while this instance is not registered'
    )
    # target
    target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/vtenants/admin/disable_tenant"
    # data
    post_data = {
        "tenant_id": self.tenant_id,
        "update_comment": f"Auto disabling this tenant due to licensing limitation, this deployment has reached the maximum number of tenants allowed ({license_active_tenants}), only the following tenants can be used: {license_active_tenants_list[0:2]}",
        "force": "true",
    }
    try:
        response = session.post(
            target_url,
            data=json.dumps(post_data),
            verify=False,
            timeout=600,
        )
        # NOTE: returning ends the command for this tenant (see above)
        return json.loads(response.text)
    except Exception as e:
        raise Exception(
            f'An exception was encountered while attempting to disable the tenant due to licensing restrictions, exception="{str(e)}"'
        )
# Enterprise Edition: only the first 6 active tenants are allowed
elif (
    license_is_valid == 1
    and license_subscription_class == "enterprise"
    and license_active_tenants > 6
    and self.tenant_id not in license_active_tenants_list[0:6]
):
    logging.info(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, due to licensing restrictions, this tenant will be automatically disabled, the tenant is over the maximum number of allowed tenants in Enterprise Edition'
    )
    # target
    target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/vtenants/admin/disable_tenant"
    # data
    post_data = {
        "tenant_id": self.tenant_id,
        "update_comment": f"Auto disabling this tenant due to licensing limitation, this deployment has reached the maximum number of tenants allowed ({license_active_tenants}), only the following tenants can be used: {license_active_tenants_list[0:6]}",
        "force": "true",
    }
    try:
        response = session.post(
            target_url,
            data=json.dumps(post_data),
            verify=False,
            timeout=600,
        )
        # NOTE: returning ends the command for this tenant (see above)
        return json.loads(response.text)
    except Exception as e:
        raise Exception(
            f'An exception was encountered while attempting to disable the tenant due to licensing restrictions, exception="{str(e)}"'
        )
# An exception was raised while attempting to validate the license
# Log the error but do nothing
elif license_is_valid == 2:
    logging.error(
        f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, an exception was raised while attempting to validate the license, no actions will be taken for now.'
    )
# end task
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
)
# end of the general task: log the total execution time of the whole command
total_run_time = round(time.time() - start, 3)
logging.info(
    f'tenant_id="{self.tenant_id}", instance_id={instance_id}, trackmetrackerhealth has terminated, total_run_time={total_run_time}'
)
dispatch(HealthTracker, sys.argv, sys.stdin, sys.stdout, __name__)