# Splunk_Deploiement/apps/trackme/bin/trackmegeneralhealthmanager.py

#!/usr/bin/env python
# coding=utf-8
__author__ = "TrackMe Limited"
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
__credits__ = "TrackMe Limited, U.K."
__license__ = "TrackMe Limited, all rights reserved"
__version__ = "0.1.0"
__maintainer__ = "TrackMe Limited, U.K."
__email__ = "support@trackme-solutions.com"
__status__ = "PRODUCTION"
# Standard library imports
import calendar
import datetime
import json
import os
import sys
import time
import uuid
# Logging imports
import logging
from logging.handlers import RotatingFileHandler
# Networking imports
import requests
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# splunk home
splunkhome = os.environ["SPLUNK_HOME"]
# set logging
filehandler = RotatingFileHandler(
"%s/var/log/splunk/trackme_general_health_manager.log" % splunkhome,
mode="a",
maxBytes=10000000,
backupCount=1,
)
formatter = logging.Formatter(
"%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s"
)
logging.Formatter.converter = time.gmtime
filehandler.setFormatter(formatter)
log = logging.getLogger() # root logger - Good to get it only once.
for hdlr in log.handlers[:]: # remove the existing file handlers
if isinstance(hdlr, logging.FileHandler):
log.removeHandler(hdlr)
log.addHandler(filehandler) # set the new handler
# set the log level to INFO, DEBUG as the default is ERROR
log.setLevel(logging.INFO)
# append current directory
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# import libs
import import_declare_test
# import Splunk libs
from splunklib.searchcommands import (
dispatch,
GeneratingCommand,
Configuration,
Option,
validators,
)
# import trackme libs
from trackme_libs import (
trackme_reqinfo,
)
# import trackme libs
from trackme_libs import (
run_splunk_search,
trackme_manage_report_schedule,
trackme_report_update_enablement,
)
# import trackme libs utils
from trackme_libs_utils import remove_leading_spaces
# import TrackMe get data libs
from trackme_libs_get_data import get_full_kv_collection
# import the collections dict
from collections_data import collections_dict
from collections_data import (
collections_list_dsm,
collections_list_flx,
collections_list_fqm,
collections_list_dhm,
collections_list_mhm,
collections_list_wlk,
collections_list_common,
)
# logging:
# To avoid overriding logging destination of callers, the libs will not set on purpose any logging definition
# and rely on callers themselves
@Configuration(distributed=False)
class HealthTracker(GeneratingCommand):
@staticmethod
def safe_create_datetime(year, month, day, hour=0, minute=0, second=0, tzinfo=None):
"""
Safely create a datetime object, handling leap years.
If trying to create Feb 29 in a non-leap year, falls back to Feb 28.
Args:
year: Year
month: Month (1-12)
day: Day of month
hour: Hour (default 0)
minute: Minute (default 0)
second: Second (default 0)
tzinfo: Timezone info (default None)
Returns:
datetime.datetime object
"""
# Check if this is Feb 29 and the year is not a leap year
if month == 2 and day == 29:
# Check if year is a leap year
is_leap_year = (year % 4 == 0 and year % 100 != 0) or (year % 400 == 0)
if not is_leap_year:
# Fall back to Feb 28 for non-leap years
day = 28
logging.debug(f'Leap year adjustment: Feb 29 in non-leap year {year} adjusted to Feb 28')
return datetime.datetime(year, month, day, hour, minute, second, tzinfo=tzinfo)
def get_uuid(self):
"""
Function to return a unique uuid which is used to trace performance run_time of each subtask.
"""
return str(uuid.uuid4())
def get_ml_rules_collection(self, collection):
"""
Get all records from an ML rules collection.
:param collection: The collection to query.
:return: A list of records, a dictionary of records, a list of keys.
"""
collection_records = []
collection_records_dict = {}
count_to_process_list = []
end = False
skip_tracker = 0
while not end:
process_collection_records = collection.data.query(skip=skip_tracker)
if process_collection_records:
for item in process_collection_records:
collection_records.append(item)
collection_records_dict[item.get("_key")] = (
item # Add the entire item to the dictionary
)
count_to_process_list.append(item.get("_key"))
skip_tracker += 5000
else:
end = True
return collection_records, collection_records_dict, count_to_process_list
def remove_ml_model(
self,
component,
rest_url,
header,
ml_model_lookup_name,
instance_id=None,
task_name=None,
task_instance_id=None,
):
"""
Removes an orphan Machine Learning model from the collection.
:param component: The component name.
:param rest_url: The REST URL to use.
:param header: The header to use.
:param ml_model_lookup_name: The Machine Learning model lookup name.
:param instance_id: The instance ID for logging purposes.
:param task_name: The task name for logging purposes.
:param task_instance_id: The task instance ID for logging purposes.
:return: True if the model was removed successfully, otherwise False.
"""
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{component}", attempting to delete orphan Machine Learning lookup_name="{ml_model_lookup_name}"'
)
try:
response = requests.delete(
rest_url,
headers=header,
verify=False,
timeout=600,
)
if response.status_code not in (
200,
201,
204,
):
error_msg = f'failure to delete ML lookup_name="{ml_model_lookup_name}", url="{rest_url}", response.status_code="{response.status_code}", response.text="{response.text}"'
raise Exception(error_msg)
else:
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, action="success", deleted lookup_name="{ml_model_lookup_name}" successfully'
)
return True
except Exception as e:
error_msg = f'failure to delete ML lookup_name="{ml_model_lookup_name}" with exception="{str(e)}"'
raise Exception(error_msg)
def reassign_ml_model(
self,
model_id,
rest_url,
header,
instance_id=None,
task_name=None,
task_instance_id=None,
):
"""
Reasign a Machine Learning model to the Splunk system user.
:param model_id: The model_id to reassign.
:param rest_url: The REST URL to use.
:param header: The header to use.
:param instance_id: The instance ID for logging purposes.
:param task_name: The task name for logging purposes.
:param task_instance_id: The task instance ID for logging purposes.
:return: True if the model was reassigned successfully, otherwise False.
"""
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, attempting to re-assign model_id="{model_id}" to splunk-system-user'
)
acl_properties = {
"sharing": "user",
"owner": "splunk-system-user",
}
# proceed boolean
proceed = False
# before re-assigning, check if the model exist by running a GET request, if the status code is different from 2**, do not proceed and log an informational message instead
try:
response = requests.get(
f"{rest_url}",
headers=header,
verify=False,
timeout=600,
)
if response.status_code not in (200, 201, 204):
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, model_id="{model_id}" does not exist, it might have been re-assigned in the meantime, skipping re-assignment'
)
return False
else:
proceed = True
except Exception as e:
logging.error(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, model_id="{model_id}" failed to retrieve model, exception="{str(e)}"'
)
if proceed:
try:
response = requests.post(
f"{rest_url}/acl",
headers=header,
data=acl_properties,
verify=False,
timeout=600,
)
if response.status_code not in (
200,
201,
204,
):
error_msg = f'failure to reassign model_id="{model_id}", url="{rest_url}", response.status_code="{response.status_code}", response.text="{response.text}"'
raise Exception(error_msg)
else:
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, action="success", model_id="{model_id}" reassigned successfully'
)
return True
except Exception as e:
logging.error(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, action="failure", model_id="{model_id}" reassigned failed, exception="{str(e)}"'
)
raise Exception(str(e))
def get_all_accounts(self, instance_id=None, task_name=None, task_instance_id=None):
"""
Update the configuration of any exising remote account, to ensure that the configuration is up to date.
:param instance_id: The instance ID for logging purposes.
:param task_name: The task name for logging purposes.
:param task_instance_id: The task instance ID for logging purposes.
:return: A list of remote accounts.
"""
# endpoint target
url = f"{self._metadata.searchinfo.splunkd_uri}/servicesNS/nobody/trackme/trackme_account"
# current_remote_accounts_list
current_remote_accounts_list = []
# first, get the list of remote accounts
try:
response = requests.get(
url,
headers={
"Authorization": f"Splunk {self._metadata.searchinfo.session_key}",
"Content-Type": "application/json",
},
verify=False,
params={
"output_mode": "json",
"count": -1,
},
timeout=600,
)
response.raise_for_status()
response_json = response.json()
# The list of remote accounts is stored as a list in entry
remote_accounts = response_json.get("entry", [])
# iterate through the remote accounts, adding them to the dict, name is the key, then we care about "content" which is a dict of our parameters
# for this account
for remote_account in remote_accounts:
remote_account_name = remote_account.get("name", None)
# add to list
current_remote_accounts_list.append(remote_account_name)
return current_remote_accounts_list
except Exception as e:
logging.error(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, error while fetching remote account list: {str(e)}'
)
return []
#
# Main
#
def generate(self, **kwargs):
if self:
# performance counter
global_start = time.time()
# set instance_id
instance_id = self.get_uuid()
# Get request info and set logging level
reqinfo = trackme_reqinfo(
self._metadata.searchinfo.session_key,
self._metadata.searchinfo.splunkd_uri,
)
log.setLevel(reqinfo["logging_level"])
# Splunk header for REST requests
header = {
"Authorization": f"Splunk {self._metadata.searchinfo.session_key}",
"Content-Type": "application/json",
}
logging.info( # First log message
f'context="general_execution", trackmegeneralhealthmanager is starting now.'
)
# global_results_dict to store results of the execution
global_results_dict = {}
# Register the object summary in the vtenant collection
collection_vtenants_name = "kv_trackme_virtual_tenants"
collection_vtenants = self.service.kvstore[collection_vtenants_name]
# get all vtenants records, this job is not tenant specific
vtenant_records = collection_vtenants.data.query()
############################################################
# Machine Learning related global health manager tasks
# Goals:
# - Inspect all ML collections, identify orphans models,
# and reassign if necessary
############################################################
# Reassignment: Ensures that all ML models are owned by splunk-system-user, amd re-assign otherwise
# run the following search to retrieve the list of existing ML models
task_start = time.time()
task_instance_id = self.get_uuid()
task_name = "mlmodels-management:splunk-system-user_reassignment"
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
# counters
ml_models_reassigned_success_count = 0
ml_models_reassigned_failures_count = 0
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting verification of ML models ownership and reassignment if necessary'
)
# Define the query
search = f'| rest splunk_server=local timeout=1200 "/servicesNS/nobody/trackme/data/lookup-table-files" | search eai:acl.app="trackme" AND title="__mlspl_model_*.mlmodel" | table title, id'
kwargs_oneshot = {
"earliest_time": "-5m",
"latest_time": "now",
"output_mode": "json",
"count": 0,
}
# A list to store current ml models (filename)
ml_models_for_reassignement_current_list = []
# A dict to store the existing models
ml_models_for_reassignement_dict_existing = {}
try:
reader = run_splunk_search(
self.service,
search,
kwargs_oneshot,
24,
5,
)
for item in reader:
if isinstance(item, dict):
ml_models_for_reassignement_current_list.append(
item.get("title")
) # this is the model filename
ml_models_for_reassignement_dict_existing[item.get("title")] = {
"id": item.get("id")
}
except Exception as e:
logging.error(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to retrieve the list of ML models, exception="{str(e)}"'
)
# Loop
for model_id in ml_models_for_reassignement_current_list:
# reassign the model
rest_url = ml_models_for_reassignement_dict_existing[model_id].get("id")
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, attempting reassignment of model_id={model_id}"'
)
try:
reassigned_model = self.reassign_ml_model(
model_id,
rest_url,
header,
instance_id,
task_name,
task_instance_id,
)
if reassigned_model:
ml_models_reassigned_success_count += 1
except Exception as e:
ml_models_reassigned_failures_count += 1
logging.error(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to reassign the model, model_id="{model_id}", exception="{str(e)}"'
)
############################################################
# Identify ML models owned by splunk-system-user
############################################################
# run the following search to retrieve the list of existing ML models
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting verification of ML models ownership and reassignment if necessary'
)
# Define the query
search = f'| rest splunk_server=local timeout=1200 "/servicesNS/splunk-system-user/trackme/data/lookup-table-files" | search eai:acl.app="trackme" AND title="__mlspl_model_*.mlmodel" | table title, id'
kwargs_oneshot = {
"earliest_time": "-5m",
"latest_time": "now",
"output_mode": "json",
"count": 0,
}
# A list to store current ml models (filename)
ml_models_current_list = []
# A dict to store the existing models
ml_models_dict_existing = {}
try:
reader = run_splunk_search(
self.service,
search,
kwargs_oneshot,
24,
5,
)
for item in reader:
if isinstance(item, dict):
ml_models_current_list.append(
item.get("title")
) # this is the model filename
ml_models_dict_existing[item.get("title")] = {
"id": item.get("id")
}
except Exception as e:
logging.error(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to retrieve the list of ML models, exception="{str(e)}"'
)
############################################################
# Identify ML models configured in TrackMe
############################################################
# A list to store ml_rules_outliers_collections
ml_rules_outliers_collections = []
# A dict to ml models definitions
ml_models_dict = {}
# A list to store ml models currently configured
ml_models_list = []
for vtenant_record in vtenant_records:
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, processing vtenant_record={json.dumps(vtenant_record, indent=2)}'
)
# get the tenant_id
tenant_id = vtenant_record.get("tenant_id")
# for component in dsm, dhm, flx, fqm, wlk
for component in ["dsm", "dhm", "flx", "fqm", "wlk"]:
# get status
component_status = vtenant_record.get(f"tenant_{component}_enabled")
# append the collection
if component_status == 1:
ml_rules_outliers_collections.append(
f"kv_trackme_{component}_outliers_entity_rules_tenant_{tenant_id}"
)
# for each outliers rules collection
for ml_rules_collection_name in ml_rules_outliers_collections:
# connect to the collection service and retrieve the records
ml_rules_collection = self.service.kvstore[ml_rules_collection_name]
# extract ml_rules_tenant_id from the collection name: trackme_<component>_outliers_entity_rules_tenant_<ml_rules_tenant_id>
ml_rules_tenant_id = ml_rules_collection_name.split("_")[-1]
# get records
try:
ml_rules_records, ml_rules_records_dict, ml_rules_records_count = (
self.get_ml_rules_collection(ml_rules_collection)
)
for ml_rules_record in ml_rules_records:
# get key
ml_rules_record_key = ml_rules_record.get("_key")
# get dictionnary entities_outliers from the field entities_outliers
entities_outliers = json.loads(
ml_rules_record.get("entities_outliers")
)
# loop trough entities_outliers, the dict key is the model_id
for ml_model_entity in entities_outliers:
ml_models_dict[ml_model_entity] = {
"model_id": ml_model_entity,
"collection_name": ml_rules_collection_name,
"collection_key": ml_rules_record_key,
"tenant_id": ml_rules_tenant_id,
}
ml_models_list.append(
f"__mlspl_{ml_model_entity}.mlmodel"
) # this is the filename
except Exception as e:
logging.error(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to retrieve the records from the collection, collection_name="{ml_rules_collection_name}", exception="{str(e)}"'
)
# log
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, {len(ml_models_dict)} ML models were found configured in TrackMe collections, will now start inspecting Splunk existing models.'
)
# log the number of currently existing models
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, {len(ml_models_current_list)} ML models were found in the system, starting orphan models inspection'
)
#
# orphan models purge / reassign
#
ml_models_purged_success_count = 0
ml_models_purged_failures_count = 0
# for each model in ml_models_current_list, if the model is not in ml_models_list, delete it
for model_id in ml_models_current_list:
if model_id not in ml_models_list and not model_id == "pending":
# remove the model
rest_url = ml_models_dict_existing[model_id].get("id")
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, attempting removal of model_id={model_id}"'
)
try:
self.remove_ml_model(
"trackme",
rest_url,
header,
model_id,
instance_id,
task_name,
task_instance_id,
)
ml_models_purged_success_count += 1
except Exception as e:
ml_models_purged_failures_count += 1
logging.error(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to remove the orphan model, model_id="{model_id}", exception="{str(e)}"'
)
# end context="mlmodels-management"
# log
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, {ml_models_purged_success_count} orphan ML models were removed, {ml_models_purged_failures_count} orphan ML models removals failed, {ml_models_reassigned_success_count} ML models were reassigned to splunk-system-user, {ml_models_reassigned_failures_count} ML models reassignments failed'
)
# add to results
global_results_dict["mlmodels_management"] = {
"ml_models_in_system_count": len(ml_models_current_list),
"ml_models_configured_count": len(ml_models_list),
"ml_models_purged_success_count": ml_models_purged_success_count,
"ml_models_purged_failures_count": ml_models_purged_failures_count,
"ml_models_reassigned_success_count": ml_models_reassigned_success_count,
"ml_models_reassigned_failures_count": ml_models_reassigned_failures_count,
"result": f"{ml_models_purged_success_count} orphan ML models were removed, {ml_models_purged_failures_count} orphan ML models removals failed, {ml_models_reassigned_success_count} ML models were reassigned to splunk-system-user, {ml_models_reassigned_failures_count} ML models reassignments failed",
}
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
)
############################################################
# End Machine Learning related global health manager tasks
############################################################
############################################################
# Splunk Remote Accounts maintenance
# Goals:
# - Calls the associated REST endpoint for each existing account,
# to verify, update account parameters if needed, and peform tokens
# rotation if needed
############################################################
task_start = time.time()
task_instance_id = self.get_uuid()
task_name = "splunk-remote-accounts:verify_and_maintain_accounts"
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, context="splunk-remote-accounts", starting verification and maintenance of Splunk remote accounts'
)
# get all accounts
current_remote_accounts_list = self.get_all_accounts(
instance_id, task_name, task_instance_id
)
# remote_accounts_maintenance_dict
remote_accounts_maintenance_dict = {}
# Loop through accounts, and call the endpoint
for account in current_remote_accounts_list:
url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/configuration/admin/maintain_remote_account"
try:
response = requests.post(
url,
headers={
"Authorization": f"Splunk {self._metadata.searchinfo.session_key}",
"Content-Type": "application/json",
},
verify=False,
data=json.dumps(
{
"accounts": account,
}
),
timeout=600,
)
response.raise_for_status()
response_json = response.json()
remote_accounts_maintenance_dict[account] = response_json
except Exception as e:
error_msg = f'error calling endpoint, exception="{str(e)}"'
logging.error(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, {error_msg}'
)
remote_accounts_maintenance_dict[account] = error_msg
# add to global_results_dict, if the dict is empty, add a message to the global_results_dict as we had no actions to perform
if not remote_accounts_maintenance_dict:
global_results_dict[f"{task_name}"] = {
"message": "No actions to perform."
}
else:
global_results_dict[f"{task_name}"] = (
remote_accounts_maintenance_dict
)
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
)
############################################################
# TrackMe Virtual Tenants auto-repair
# Goals:
# - For each enable Virtual Tenant, verify that all expected
# are effectively available in the system. (KV collections...)
# - If for some reasons an expected object is missing,
# auto-repair will attempt to create it and fix the tenant inconsistency.
############################################################
task_start = time.time()
task_instance_id = self.get_uuid()
task_name = "virtual_tenants:auto-repair:collections_and_transforms"
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
# init auto_repair_actions_list
auto_repair_actions_list = []
# A dict to store objects that were verified and their status, per tenant_id as the key
tenants_objects_status_dict = {}
# collections map per component, including common collections
collections_map_per_component = {
"dsm": collections_list_dsm,
"dhm": collections_list_dhm,
"mhm": collections_list_mhm,
"flx": collections_list_flx,
"fqm": collections_list_fqm,
"wlk": collections_list_wlk,
"common": collections_list_common, # Add common collections
}
for vtenant_record in vtenant_records:
# get the tenant_id
tenant_id = vtenant_record.get("tenant_id")
# check if tenant is a replica tenant, if so, skip it
if vtenant_record.get("tenant_replica", 0) == 1:
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, is a replica tenant, skipping.'
)
continue
# init tenant_id_checked_status_dict
tenant_id_checked_status_dict = {}
# get RBAC
tenant_owner = str(vtenant_record.get("tenant_owner"))
tenant_roles_admin = str(vtenant_record.get("tenant_roles_admin"))
tenant_roles_user = str(vtenant_record.get("tenant_roles_user"))
tenant_roles_power = str(vtenant_record.get("tenant_roles_power"))
# TrackMe sharing level
trackme_default_sharing = reqinfo["trackme_conf"]["trackme_general"][
"trackme_default_sharing"
]
# for read permissions, concatenate admin, power and user
tenant_roles_read_perms = (
f"{tenant_roles_admin},{tenant_roles_power},{tenant_roles_user}"
)
# for write permissions, concatenate admin, power
tenant_roles_write_perms = f"{tenant_roles_admin},{tenant_roles_power}"
# for component in dsm, dhm, flx, fqm, wlk and common
for component in ["dsm", "dhm", "mhm", "flx", "fqm", "wlk", "common"]:
# get status
try:
component_status = int(
vtenant_record.get(
f"tenant_{component}_enabled", 1
) # Default to 1 for common
)
except Exception as e:
component_status = 0
# only continue if component is enabled
if component_status == 1:
# Handle collections
for object_name in collections_map_per_component[component]:
#
# Verify that the KV collection exists
#
kvstore_collection_name = (
f"kv_{object_name}_tenant_{tenant_id}"
)
kvstore_collection_exists = (
True # assume the collection exists
)
# check if the collection exists
try:
collection = self.service.kvstore[
kvstore_collection_name
]
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, collection_name={kvstore_collection_name}, kvstore_collection_exists={kvstore_collection_exists}'
)
tenant_id_checked_status_dict[
kvstore_collection_name
] = {
"result": "success",
"type": "kvstore_collection",
}
except Exception as e:
kvstore_collection_exists = False
logging.error(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, failed to retrieve the collection, collection_name="{kvstore_collection_name}", exception="{str(e)}"'
)
tenant_id_checked_status_dict[
kvstore_collection_name
] = {
"result": "failure",
"exception": str(e),
"type": "kvstore_collection",
}
#
# Verify that the transform exists and contains the expected fields
#
transform_name = f"{object_name}_tenant_{tenant_id}"
transform_exists = True # assume the transform exists
transforms_fields_list_csv = None
transforms_fields_list = None
transforms_expected_fields_list_csv = collections_dict[
object_name
]
transforms_expected_fields_list = [
x.strip()
for x in transforms_expected_fields_list_csv.split(",")
]
transforms_has_missing_fields = False # assume False
# check if the transform exists
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, inspecting transform_name={transform_name}'
)
try:
transform = self.service.confs["transforms"][
transform_name
]
transforms_fields_list_csv = transform["fields_list"]
transforms_fields_list = (
[
x.strip()
for x in transforms_fields_list_csv.split(",")
]
if transforms_fields_list_csv
else []
)
# Verify that the transforms has at the minimum the expected fields
for expected_field in transforms_expected_fields_list:
if expected_field not in transforms_fields_list:
transforms_has_missing_fields = True
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, transforms_name={transform_name}, transforms_fields_list={transforms_fields_list}, transforms_has_missing_fields={transforms_has_missing_fields}'
)
except Exception as e:
transform_exists = False
logging.error(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, failed to retrieve the transform, transform_name="{transform_name}", exception="{str(e)}"'
)
# temp logging
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, kvstore_collection_exists={kvstore_collection_exists}, transform_exists={transform_exists}, transform_fields_list={transforms_fields_list}, transforms_has_missing_fields={transforms_has_missing_fields}'
)
#
# Take action if needed
#
#
# KVstore collection
#
# If the KVstore collection does not exist, create it
if not kvstore_collection_exists:
logging.warning(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, collection_name={kvstore_collection_name}, kvstore_collection_exists={kvstore_collection_exists}, the KVstore collection was detected missing, it will be created.'
)
# create the KVstore collection
url = f'{reqinfo["server_rest_uri"]}/services/trackme/v2/configuration/admin/create_kvcollection'
data = {
"tenant_id": tenant_id,
"collection_name": kvstore_collection_name,
"collection_acl": {
"owner": tenant_owner,
"sharing": trackme_default_sharing,
"perms.write": tenant_roles_write_perms,
"perms.read": tenant_roles_read_perms,
},
"owner": tenant_owner,
}
try:
response = requests.post(
url,
headers={
"Authorization": f"Splunk {self._metadata.searchinfo.session_key}"
},
data=json.dumps(data),
verify=False,
timeout=600,
)
response.raise_for_status()
logging.info(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, collection_name={kvstore_collection_name}, kvstore_collection_exists={kvstore_collection_exists}, the KVstore collection was detected missing, it has been created successfully.'
)
# add to auto_repair_actions_list
auto_repair_actions_list.append(
{
"action": "create_kvcollection",
"tenant_id": tenant_id,
"component": component,
"collection_name": kvstore_collection_name,
"message": "KVstore collection was detected missing, it has been created successfully.",
}
)
except Exception as e:
logging.error(
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, collection_name={kvstore_collection_name}, kvstore_collection_exists={kvstore_collection_exists}, the KVstore collection was detected missing, attempt to create it has failed, exception="{str(e)}"'
)
# add to auto_repair_actions_list
auto_repair_actions_list.append(
{
"action": "create_kvcollection",
"tenant_id": tenant_id,
"component": component,
"collection_name": kvstore_collection_name,
"message": "KVstore collection was detected missing, attempt to create it has failed.",
"exception": str(e),
}
)
#
# Transforms definition: if the transform does not exist, create it; if it
# exists but has missing fields, it is deleted and recreated.
#
# First, record the verification outcome for this transform in the
# per-tenant status dict; the actual repair is performed below.
if transform_exists and not transforms_has_missing_fields:
    tenant_id_checked_status_dict[transform_name] = {
        "result": "success",
        "type": "transform",
    }
elif not transform_exists:
    tenant_id_checked_status_dict[transform_name] = {
        "result": "failure",
        "exception": "The transform was detected missing.",
        "type": "transform",
    }
elif transform_exists and transforms_has_missing_fields:
    tenant_id_checked_status_dict[transform_name] = {
        "result": "failure",
        "exception": "The transform was detected as existing but has missing fields.",
        "type": "transform",
    }
else:
    # defensive: should be unreachable, the three cases above are exhaustive
    tenant_id_checked_status_dict[transform_name] = {
        "result": "unknown",
        "transform_exists": transform_exists,
        "transforms_has_missing_fields": transforms_has_missing_fields,
        "type": "transform",
    }
# Repair path: missing transform -> create; inconsistent transform ->
# delete then recreate with the expected fields and ACLs.
if not transform_exists or transforms_has_missing_fields:
    if not transform_exists:
        logging.warning(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, transforms_name={transform_name}, transforms_exists={transform_exists}, the transform was detected missing, it will be created.'
        )
    if transform_exists and transforms_has_missing_fields:
        logging.warning(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, transforms_name={transform_name}, transforms_exists={transform_exists}, the transform was detected as existing but has missing fields, it will be recreated.'
        )
        #
        # delete the transform (only meaningful when it exists but is inconsistent)
        #
        url = f'{reqinfo["server_rest_uri"]}/services/trackme/v2/configuration/admin/delete_kvtransform'
        data = {
            "tenant_id": tenant_id,
            "transform_name": transform_name,
        }
        try:
            response = requests.post(
                url,
                headers={
                    "Authorization": f"Splunk {self._metadata.searchinfo.session_key}"
                },
                data=json.dumps(data),
                verify=False,
                timeout=600,
            )
            response.raise_for_status()
            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, transforms_name={transform_name}, transforms_exists={transform_exists}, the transform was detected as existing but has missing fields, it has been deleted successfully.'
            )
            # add to auto_repair_actions_list
            auto_repair_actions_list.append(
                {
                    "action": "delete_kvtransform",
                    "tenant_id": tenant_id,
                    "component": component,
                    "transform_name": transform_name,
                    "message": "The transform was detected as existing but has missing fields, it has been deleted successfully.",
                }
            )
        except Exception as e:
            # fixed log message: this is the failure path, it previously
            # claimed the deletion had succeeded and did not log the exception
            logging.error(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, transforms_name={transform_name}, transforms_exists={transform_exists}, the transform was detected as existing but has missing fields, attempt to delete it has failed, exception="{str(e)}"'
            )
            # add to auto_repair_actions_list
            auto_repair_actions_list.append(
                {
                    "action": "delete_kvtransform",
                    "tenant_id": tenant_id,
                    "component": component,
                    "transform_name": transform_name,
                    "message": "The transform was detected as existing but has missing fields, attempt to delete it has failed.",
                    "exception": str(e),
                }
            )
    #
    # create the transform (both for the missing and the recreated cases)
    #
    url = f'{reqinfo["server_rest_uri"]}/services/trackme/v2/configuration/admin/create_kvtransform'
    data = {
        "tenant_id": tenant_id,
        "transform_name": transform_name,
        "transform_fields": transforms_expected_fields_list_csv,
        "collection_name": kvstore_collection_name,
        # ACLs are aligned on the tenant definition (owner, sharing, perms)
        "transform_acl": {
            "owner": tenant_owner,
            "sharing": trackme_default_sharing,
            "perms.write": tenant_roles_write_perms,
            "perms.read": tenant_roles_read_perms,
        },
        "owner": tenant_owner,
    }
    try:
        response = requests.post(
            url,
            headers={
                "Authorization": f"Splunk {self._metadata.searchinfo.session_key}"
            },
            data=json.dumps(data),
            verify=False,
            timeout=600,
        )
        response.raise_for_status()
        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, transforms_name={transform_name}, transforms_exists={transform_exists}, the transform was detected missing, it has been created successfully.'
        )
        # add to auto_repair_actions_list
        auto_repair_actions_list.append(
            {
                "action": "create_kvtransform",
                "tenant_id": tenant_id,
                "component": component,
                "transform_name": transform_name,
                "message": "The transform was detected missing or inconsistent, it has been created successfully.",
            }
        )
    except Exception as e:
        # fixed log message: this is the failure path, it previously
        # claimed the creation had succeeded and did not log the exception
        logging.error(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, transforms_name={transform_name}, transforms_exists={transform_exists}, the transform was detected missing or inconsistent, attempt to create it has failed, exception="{str(e)}"'
        )
        # add to auto_repair_actions_list
        auto_repair_actions_list.append(
            {
                "action": "create_kvtransform",
                "tenant_id": tenant_id,
                "component": component,
                "transform_name": transform_name,
                "message": "The transform was detected missing or inconsistent, attempt to create it has failed.",
                "exception": str(e),
            }
        )
# add to tenants_objects_status_dict
# store the per-transform/per-collection verification results for this tenant
tenants_objects_status_dict[tenant_id] = tenant_id_checked_status_dict
# add to global_results_dict
# publish this task's outcome: full knowledge objects status plus the list
# of auto-repair actions performed (or a human readable no-op marker)
global_results_dict[f"{task_name}"] = {
    "knowledge_objects_status": tenants_objects_status_dict,
    "auto_repair_actions_list": (
        auto_repair_actions_list
        if auto_repair_actions_list
        else "No actions to perform."
    ),
}
logging.info(
    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
)
############################################################
# TrackMe Virtual Tenants configuration issues fixer
# Goals:
# - Run a Splunk search to identify Virtual Tenants with configuration issues (missing reports)
# - For each tenant found, identify enabled components from the central KVstore collection
# - For each tenant/component combination, run the REST API call to fix issues
# - Exclude replica tenants (tenant_replica = 1)
############################################################
task_start = time.time()
task_instance_id = self.get_uuid()
task_name = "virtual_tenants:auto-repair:components_reports"
logging.info(
    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
# Step 1: Run Splunk search to identify tenants with missing reports
# Scans internal logs for "optimize_tenant_scheduled_reports" failures and
# aggregates the error count per tenant_id over the last 24 hours.
search = remove_leading_spaces(
    """
search (index=_internal sourcetype=trackme:rest_api) OR (index=_internal sourcetype=trackme:custom_commands:*)
log_level=error
task="optimize_tenant_scheduled_reports"
"failure to get report report_name"
"urlencoded"
| stats count by tenant_id
""")
kwargs_oneshot = {
    "earliest_time": "-24h",
    "latest_time": "now",
    "output_mode": "json",
    "count": 0,
}
# Counters
tenants_with_issues_found = 0
tenants_processed = 0
tenants_fixed = 0
tenants_skipped = 0
total_components_fixed = 0
total_components_failed = 0
# Lists to store detailed information
tenants_with_issues = []
tenants_processed_details = []
rest_call_responses = []
try:
    # NOTE(review): the trailing 24 / 5 arguments are presumably the retry
    # count and delay used by run_splunk_search — confirm in trackme_libs
    reader = run_splunk_search(
        self.service,
        search,
        kwargs_oneshot,
        24,
        5,
    )
    for item in reader:
        if isinstance(item, dict):
            tenant_id = item.get("tenant_id")
            error_count = item.get("count", 0)
            if tenant_id:
                tenants_with_issues.append({
                    "tenant_id": tenant_id,
                    "error_count": error_count
                })
                tenants_with_issues_found += 1
except Exception as e:
    # search failure is non-fatal: the task simply processes zero tenants
    logging.error(
        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to run search for tenants with issues, exception="{str(e)}"'
    )
logging.info(
    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, found {tenants_with_issues_found} tenants with configuration issues'
)
# Step 2: Process each tenant found by the search above.
# For every flagged tenant: locate its record in the central collection,
# skip replicas and tenants with no enabled components, then call the
# check_component_tenant endpoint for each enabled component.
for tenant_info in tenants_with_issues:
    tenant_id = tenant_info["tenant_id"]
    error_count = tenant_info["error_count"]
    tenants_processed += 1
    # Per-tenant processing summary, always appended to
    # tenants_processed_details whatever the outcome
    tenant_processing_detail = {
        "tenant_id": tenant_id,
        "error_count": error_count,
        "enabled_components": [],
        "components_fixed": 0,
        "components_failed": 0,
        "is_replica": False,
        "skipped_reason": None,
        "processing_status": "processing"
    }
    logging.info(
        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, processing tenant {tenant_id} with {error_count} errors'
    )
    # Get tenant record from central KVstore collection
    try:
        # Find the tenant record in vtenant_records (already loaded)
        tenant_record = None
        for vtenant_record in vtenant_records:
            if vtenant_record.get("tenant_id") == tenant_id:
                tenant_record = vtenant_record
                break
        if not tenant_record:
            logging.warning(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, tenant record not found in central collection, skipping'
            )
            tenant_processing_detail["skipped_reason"] = "tenant record not found in central collection"
            tenant_processing_detail["processing_status"] = "skipped"
            tenants_processed_details.append(tenant_processing_detail)
            tenants_skipped += 1
            continue
        # Check if tenant is a replica (exclude if so); the flag may be
        # stored as a string, default to non-replica if unparseable
        try:
            tenant_replica = int(tenant_record.get("tenant_replica", 0))
        except (ValueError, TypeError):
            tenant_replica = 0
        tenant_processing_detail["is_replica"] = (tenant_replica == 1)
        if tenant_replica == 1:
            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, is a replica tenant, skipping'
            )
            tenant_processing_detail["skipped_reason"] = "replica tenant"
            tenant_processing_detail["processing_status"] = "skipped"
            tenants_processed_details.append(tenant_processing_detail)
            tenants_skipped += 1
            continue
        # Get enabled components for this tenant from the per-component
        # enablement flags of the tenant record
        enabled_components = []
        component_fields = {
            "dsm": "tenant_dsm_enabled",
            "dhm": "tenant_dhm_enabled",
            "mhm": "tenant_mhm_enabled",
            "flx": "tenant_flx_enabled",
            "wlk": "tenant_wlk_enabled",
            "fqm": "tenant_fqm_enabled"
        }
        for component, field_name in component_fields.items():
            try:
                if int(tenant_record.get(field_name, 0)) == 1:
                    enabled_components.append(component)
            except (ValueError, TypeError):
                continue
        tenant_processing_detail["enabled_components"] = enabled_components
        if not enabled_components:
            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, has no enabled components, skipping'
            )
            tenant_processing_detail["skipped_reason"] = "no enabled components"
            tenant_processing_detail["processing_status"] = "skipped"
            tenants_processed_details.append(tenant_processing_detail)
            tenants_skipped += 1
            continue
        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, has enabled components: {enabled_components}'
        )
        # Step 3: Fix each enabled component
        tenant_components_fixed = 0
        tenant_components_failed = 0
        for component in enabled_components:
            try:
                # Prepare the REST API call
                target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/vtenants/admin/check_component_tenant"
                payload = {
                    "tenant_id": tenant_id,
                    "component_target": component,
                    # fixed: was an f-string with no placeholders
                    "update_comment": "Automated fix for missing reports - general health manager task"
                }
                logging.info(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, fixing component {component}'
                )
                # Make the REST call
                response = requests.post(
                    target_url,
                    headers=header,
                    data=json.dumps(payload),
                    verify=False,
                    timeout=600
                )
                # Store REST call response details
                rest_call_response = {
                    "tenant_id": tenant_id,
                    "component": component,
                    "status_code": response.status_code,
                    "success": response.status_code == 200,
                    "response_text": response.text,
                    "timestamp": time.time()
                }
                # Try to parse JSON response if possible
                # fixed: narrowed from a bare "except:" which also caught
                # SystemExit/KeyboardInterrupt
                try:
                    rest_call_response["response_json"] = response.json()
                except Exception:
                    rest_call_response["response_json"] = None
                rest_call_responses.append(rest_call_response)
                if response.status_code == 200:
                    tenant_components_fixed += 1
                    total_components_fixed += 1
                    logging.info(
                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component {component}, successfully fixed'
                    )
                else:
                    tenant_components_failed += 1
                    total_components_failed += 1
                    logging.error(
                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component {component}, failed to fix: {response.status_code} - {response.text}'
                    )
            except Exception as e:
                tenant_components_failed += 1
                total_components_failed += 1
                # Store exception details
                rest_call_response = {
                    "tenant_id": tenant_id,
                    "component": component,
                    "status_code": None,
                    "success": False,
                    "response_text": str(e),
                    "timestamp": time.time(),
                    "exception": True
                }
                rest_call_responses.append(rest_call_response)
                logging.error(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component {component}, exception during fix: {str(e)}'
                )
        # A tenant counts as fixed when at least one component was repaired
        if tenant_components_fixed > 0:
            tenants_fixed += 1
        # Update tenant processing details
        tenant_processing_detail["components_fixed"] = tenant_components_fixed
        tenant_processing_detail["components_failed"] = tenant_components_failed
        tenant_processing_detail["processing_status"] = "completed"
        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, completed: {tenant_components_fixed} components fixed, {tenant_components_failed} components failed'
        )
    except Exception as e:
        logging.error(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, exception during processing: {str(e)}'
        )
        tenant_processing_detail["skipped_reason"] = f"exception during processing: {str(e)}"
        tenant_processing_detail["processing_status"] = "error"
        tenants_skipped += 1
    # Always add the tenant processing detail to the list
    # (skip paths above appended it already and used continue)
    tenants_processed_details.append(tenant_processing_detail)
# add to global_results_dict
# publish this task's outcome: aggregated counters, per-tenant details,
# raw REST call responses, and a one-line human readable summary
global_results_dict[f"{task_name}"] = {
    "tenants_with_issues_found": tenants_with_issues_found,
    "tenants_processed": tenants_processed,
    "tenants_fixed": tenants_fixed,
    "tenants_skipped": tenants_skipped,
    "total_components_fixed": total_components_fixed,
    "total_components_failed": total_components_failed,
    "tenants_with_issues": tenants_with_issues,
    "tenants_processed_details": tenants_processed_details,
    "rest_call_responses": rest_call_responses,
    "result": f"{tenants_with_issues_found} tenants with issues found, {tenants_processed} processed, {tenants_fixed} fixed, {tenants_skipped} skipped, {total_components_fixed} components fixed, {total_components_failed} components failed",
}
logging.info(
    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
)
############################################################
# TrackMe Virtual Tenants Check Health Tracker
# Goals:
# - If the tenant is enabled, then the health tracker should be enabled and scheduled.
# - If the tenant is disabled, then the health tracker should be disabled.
############################################################
task_start = time.time()
# unique id for this task execution, used to correlate log events
task_instance_id = self.get_uuid()
task_name = "virtual_tenants:check_health_tracker"
logging.info(
    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
def manage_savedsearch_schedule(
    tenant_id, savedsearch_names, feature_enabled, feature_name
):
    """
    Helper function to manage saved search scheduling based on feature enablement.

    Args:
        tenant_id: Identifier of the Virtual Tenant owning the saved searches
        savedsearch_names: List of saved search names to manage
        feature_enabled: Boolean indicating if the feature should be enabled
        feature_name: String name of the feature for logging purposes

    Returns:
        A dict describing the action taken ("disable_savedsearch",
        "enable_savedsearch" or "nothing_to_do") with tenant_id,
        savedsearch_name and a human readable message.

    NOTE(review): the function returns from inside the loop, so only the
    first entry of savedsearch_names is fully processed before returning;
    callers currently pass single-element lists — confirm before passing
    multiple names.
    """
    for savedsearch_name in savedsearch_names:
        # get the status of the savedsearch (properties + ACL)
        savedsearch_properties, savedsearch_acl = (
            trackme_manage_report_schedule(
                logging,
                self._metadata.searchinfo.session_key,
                self._metadata.searchinfo.splunkd_uri,
                tenant_id,
                savedsearch_name,
                action="status",
            )
        )
        # log
        logging.info(
            f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", savedsearch_properties="{json.dumps(savedsearch_properties, indent=2)}", savedsearch_acl="{json.dumps(savedsearch_acl, indent=2)}"'
        )
        # get the disabled status (1 = report disabled)
        disabled = int(savedsearch_properties.get("disabled", 0))
        # get the is_scheduled status (1 = report has a cron schedule)
        is_scheduled = int(savedsearch_properties.get("is_scheduled", 0))
        # Check tenant status first - if tenant is disabled, ensure health tracker is disabled
        if feature_enabled == False:
            # Tenant is disabled - ensure health tracker is disabled (but keep it scheduled)
            if disabled == 0:
                # Report is enabled - disable it
                logging.info(
                    f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", tenant is disabled, disabling savedsearch.'
                )
                try:
                    trackme_report_update_enablement(
                        self._metadata.searchinfo.session_key,
                        self._metadata.searchinfo.splunkd_uri,
                        tenant_id,
                        savedsearch_name,
                        "disable",
                    )
                    logging.info(
                        f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", savedsearch disabled successfully.'
                    )
                    return {
                        "action": "disable_savedsearch",
                        "tenant_id": tenant_id,
                        "savedsearch_name": savedsearch_name,
                        "message": "The savedsearch has been disabled successfully.",
                    }
                except Exception as e:
                    # failure to disable: log and fall through to the next
                    # savedsearch name (no result returned for this one)
                    logging.error(
                        f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", an exception was encountered while trying to disable savedsearch, exception="{str(e)}"'
                    )
            else:
                # Report is already disabled - nothing to do
                logging.info(
                    f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", tenant is disabled and savedsearch is already disabled, nothing to do.'
                )
                return {
                    "action": "nothing_to_do",
                    "tenant_id": tenant_id,
                    "savedsearch_name": savedsearch_name,
                    "message": "Tenant is disabled and savedsearch is already disabled, nothing to do.",
                }
        # Tenant is enabled - ensure health tracker is enabled AND scheduled
        elif feature_enabled == True:
            # Track if we performed any actions
            action_performed = False
            action_message = ""
            # Check if we need to enable the report
            if disabled == 1:
                logging.info(
                    f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", the report is currently disabled and needs to be enabled.'
                )
                try:
                    trackme_report_update_enablement(
                        self._metadata.searchinfo.session_key,
                        self._metadata.searchinfo.splunkd_uri,
                        tenant_id,
                        savedsearch_name,
                        "enable",
                    )
                    logging.info(
                        f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", savedsearch enabled successfully.'
                    )
                    action_performed = True
                    action_message = "The savedsearch has been enabled successfully"
                except Exception as e:
                    logging.error(
                        f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", an exception was encountered while trying to enable the savedsearch, exception="{str(e)}"'
                    )
                    # stop here if we had an exception enabling the savedsearch
                    continue
            # Check if we need to schedule the report
            if is_scheduled == 0:
                logging.info(
                    f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", the report needs to be scheduled.'
                )
                try:
                    savedsearch_properties, savedsearch_acl = (
                        trackme_manage_report_schedule(
                            logging,
                            self._metadata.searchinfo.session_key,
                            self._metadata.searchinfo.splunkd_uri,
                            tenant_id,
                            savedsearch_name,
                            action="enable",
                        )
                    )
                    logging.info(
                        f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", savedsearch scheduled successfully, properties="{json.dumps(savedsearch_properties, indent=2)}"'
                    )
                    action_performed = True
                    if action_message:
                        action_message += " and scheduled successfully."
                    else:
                        action_message = "The savedsearch has been scheduled successfully."
                except Exception as e:
                    logging.error(
                        f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", an exception was encountered while trying to schedule savedsearch, exception="{str(e)}"'
                    )
            # Return appropriate result based on actions performed
            if action_performed:
                return {
                    "action": "enable_savedsearch",
                    "tenant_id": tenant_id,
                    "savedsearch_name": savedsearch_name,
                    "message": action_message,
                }
            else:
                # Report is already enabled and scheduled - nothing to do
                logging.info(
                    f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", tenant is enabled and savedsearch is already enabled and scheduled, nothing to do.'
                )
                return {
                    "action": "nothing_to_do",
                    "tenant_id": tenant_id,
                    "savedsearch_name": savedsearch_name,
                    "message": "Tenant is enabled and savedsearch is already enabled and scheduled, nothing to do.",
                }
        else:
            # This should not happen as we've covered all cases above
            logging.warning(
                f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", unexpected state, nothing to do.'
            )
            return {
                "action": "nothing_to_do",
                "tenant_id": tenant_id,
                "savedsearch_name": savedsearch_name,
                "message": "Unexpected state, nothing to do.",
            }
# init auto_repair_actions_list
auto_repair_actions_list = []
# A dict to store objects that were verified and their status, per tenant_id as the key
tenants_objects_status_dict = {}
for vtenant_record in vtenant_records:
    # get the tenant_id
    tenant_id = vtenant_record.get("tenant_id")
    # get the tenant_status (enabled/disabled)
    tenant_status = vtenant_record.get("tenant_status", "enabled")
    # check if tenant is a replica tenant, if so, skip it
    # fixed: coerce to int — the KVstore may return the flag as a string,
    # in which case a bare "== 1" comparison never matches (same pattern
    # as the coercion used by the components_reports task)
    try:
        tenant_replica = int(vtenant_record.get("tenant_replica", 0))
    except (ValueError, TypeError):
        tenant_replica = 0
    if tenant_replica == 1:
        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, is a replica tenant, skipping.'
        )
        continue
    # init health_tracker_report_name
    health_tracker_report_name = (
        f"trackme_health_tracker_tenant_{tenant_id}"
    )
    health_tracker_check_result = {}
    try:
        # Determine if health tracker should be enabled based on tenant status
        health_tracker_enabled = (tenant_status == "enabled")
        health_tracker_check_result = manage_savedsearch_schedule(
            tenant_id, [health_tracker_report_name], health_tracker_enabled, "health_tracker"
        )
    except Exception as e:
        logging.error(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"'
        )
    # add to tenants_objects_status_dict
    tenants_objects_status_dict[tenant_id] = health_tracker_check_result
# add to global_results_dict
global_results_dict[f"{task_name}"] = {
    "health_tracker_check_result": tenants_objects_status_dict,
}
logging.info(
    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
)
############################################################
# TrackMe Virtual Tenants stateful alerts records expiration
# Goals:
# - For each enabled Virtual Tenant, search for closed stateful alerts records
# in the KVstore collection, and delete them if they are older than the
# configured expiration (trackme_stateful_records_expiration_days).
# When purging stateful alerts records, search and purge associated charts
# records (if any), plus orphan charts records with no parent incident.
############################################################
task_start = time.time()
task_instance_id = self.get_uuid()
task_name = "virtual_tenants:stateful_alerts_records_expiration"
logging.info(
    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
# get the stateful records expiration days
stateful_records_expiration_days = int(
    reqinfo["trackme_conf"]["trackme_general"][
        "trackme_stateful_records_expiration_days"
    ]
)
# counters (variable and output key names kept as-is for output compatibility)
expired_statefule_records_deleted_count = 0
expired_associated_charts_records_deleted_count = 0
orphans_charts_records_deleted_count = 0
for vtenant_record in vtenant_records:
    # get the tenant_id
    tenant_id = vtenant_record.get("tenant_id")
    # check if tenant is a replica tenant, if so, skip it
    # fixed: coerce to int — the KVstore may return the flag as a string,
    # in which case a bare "== 1" comparison never matches
    try:
        tenant_replica = int(vtenant_record.get("tenant_replica", 0))
    except (ValueError, TypeError):
        tenant_replica = 0
    if tenant_replica == 1:
        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, is a replica tenant, skipping.'
        )
        continue
    # Define the query: closed incidents older than the expiration window
    search = remove_leading_spaces(
        f"""
| inputlookup trackme_stateful_alerting_tenant_{tenant_id} where alert_status="closed" | eval keyid=_key
| eval record_age=now()-ctime, is_expired=if(record_age > 86400*{stateful_records_expiration_days}, 1, 0)
| where is_expired=1
| table keyid, incident_id
"""
    )
    # A list to store expired incident_id
    expired_incident_id_list = []
    # A list to store expired records (KVstore _key values)
    expired_records_list = []
    # A list to store expired associated charts records
    expired_associated_charts_records_list = []
    # A list to store orphans charts records
    orphans_charts_records = []
    # Run the search
    logging.info(
        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, running search="{search}"'
    )
    try:
        reader = run_splunk_search(
            self.service,
            search,
            {
                "earliest_time": "-5m",
                "latest_time": "now",
                "output_mode": "json",
                "count": 0,
            },
            24,
            5,
        )
        for item in reader:
            if isinstance(item, dict):
                expired_records_list.append(item.get("keyid"))
                expired_incident_id_list.append(item.get("incident_id"))
                logging.info(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, records for incident_id={item.get("incident_id")} have been detected as expired and will be deleted from the KVstore collections, keyid={item.get("keyid")}'
                )
    except Exception as e:
        logging.error(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to retrieve the list of expired records, exception="{str(e)}"'
        )
    # If nothing to do, continue
    if not expired_records_list:
        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, no expired records to process, skipping.'
        )
        continue
    else:
        # Run a new search to retrieve the list of associated charts records
        # Convert the list to a CSV string filtered
        expired_records_filtered_csv = (
            f"({','.join(expired_incident_id_list)})"
        )
        search = remove_leading_spaces(
            f"""
| inputlookup trackme_stateful_alerting_charts_tenant_{tenant_id} where incident_id IN {expired_records_filtered_csv} | eval keyid=_key
| table keyid, incident_id
"""
        )
        # Run the search
        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, running search="{search}"'
        )
        try:
            reader = run_splunk_search(
                self.service,
                search,
                {
                    "earliest_time": "-5m",
                    "latest_time": "now",
                    "output_mode": "json",
                    "count": 0,
                },
                24,
                5,
            )
            for item in reader:
                if isinstance(item, dict):
                    expired_associated_charts_records_list.append(
                        item.get("keyid")
                    )
        except Exception as e:
            logging.error(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to retrieve the list of associated charts records, exception="{str(e)}"'
            )
        # Run a new search to retrieve the list of orphans charts records
        # (charts whose incident_id no longer exists in the parent collection)
        search = remove_leading_spaces(
            f"""
| inputlookup trackme_stateful_alerting_charts_tenant_{tenant_id} | eval keyid=_key
| lookup trackme_stateful_alerting_tenant_{tenant_id} incident_id AS incident_id OUTPUT incident_id as parent_incident_id
| where (isnull(parent_incident_id) OR parent_incident_id="")
| table keyid, incident_id
"""
        )
        # Run the search
        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, running search="{search}"'
        )
        try:
            reader = run_splunk_search(
                self.service,
                search,
                {
                    "earliest_time": "-5m",
                    "latest_time": "now",
                    "output_mode": "json",
                    "count": 0,
                },
                24,
                5,
            )
            for item in reader:
                if isinstance(item, dict):
                    orphans_charts_records.append(item.get("keyid"))
        except Exception as e:
            logging.error(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to retrieve the list of orphans charts records, exception="{str(e)}"'
            )
    # Purge expired records from the stateful collection, if any
    if expired_records_list:
        # connect to the collection
        collection_stateful_alerting_name = (
            f"kv_trackme_stateful_alerting_tenant_{tenant_id}"
        )
        collection_stateful_alerting = self.service.kvstore[
            collection_stateful_alerting_name
        ]
        # for each expired record, delete the record from the stateful collection
        for expired_record in expired_records_list:
            try:
                # Remove the record
                collection_stateful_alerting.data.delete(
                    json.dumps({"_key": expired_record})
                )
                expired_statefule_records_deleted_count += 1
            except Exception as e:
                logging.error(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to delete the record from collection="{collection_stateful_alerting_name}", exception="{str(e)}"'
                )
    # Purge expired associated charts records from the charts collection, if any
    if expired_associated_charts_records_list:
        # connect to the charts collection
        # fixed: collection name had a double underscore (tenant__{tenant_id})
        # and the code kept deleting from the parent stateful collection
        # handle instead of connecting to the charts collection
        collection_stateful_alerting_charts_name = (
            f"kv_trackme_stateful_alerting_charts_tenant_{tenant_id}"
        )
        collection_stateful_alerting_charts = self.service.kvstore[
            collection_stateful_alerting_charts_name
        ]
        # for each expired record, delete the record from the charts collection
        for expired_record in expired_associated_charts_records_list:
            try:
                # Remove the record
                collection_stateful_alerting_charts.data.delete(
                    json.dumps({"_key": expired_record})
                )
                expired_associated_charts_records_deleted_count += 1
            except Exception as e:
                logging.error(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to delete the record from collection="{collection_stateful_alerting_charts_name}", exception="{str(e)}"'
                )
    # Purge orphans charts records from the charts collection, if any
    if orphans_charts_records:
        # connect to the charts collection (same fixes as above)
        collection_stateful_alerting_charts_name = (
            f"kv_trackme_stateful_alerting_charts_tenant_{tenant_id}"
        )
        collection_stateful_alerting_charts = self.service.kvstore[
            collection_stateful_alerting_charts_name
        ]
        # for each orphan record, delete the record from the charts collection
        for orphan_record in orphans_charts_records:
            try:
                # Remove the record
                collection_stateful_alerting_charts.data.delete(
                    json.dumps({"_key": orphan_record})
                )
                # fixed: previously incremented the associated-charts counter,
                # leaving orphans_charts_records_deleted_count always at 0
                orphans_charts_records_deleted_count += 1
            except Exception as e:
                logging.error(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to delete the record from collection="{collection_stateful_alerting_charts_name}", exception="{str(e)}"'
                )
# add to global_results_dict (key names preserved for output compatibility)
global_results_dict[
    f"{task_name}"
] = {
    "expired_statefule_records_deleted_count": expired_statefule_records_deleted_count,
    "expired_associated_charts_records_deleted_count": expired_associated_charts_records_deleted_count,
    "orphans_charts_records_deleted_count": orphans_charts_records_deleted_count,
}
logging.info(
    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
)
############################################################
# TrackMe Virtual Tenants stateful alerts duplicate opened incidents cleanup
# Goals:
# - For each enabled Virtual Tenant, verify that for a given object_id,
# there should not be more than one opened incident (alert_status="opened") in the KVstore
# - If there are more than one incident_id for the same object_id, keep only the latest
# (based on the field mtime which is the epochtime of the last modification of the incident_id),
# other records should be updated with alert_status="closed"
############################################################
task_start = time.time()
# unique id for this task execution, used to correlate log events
task_instance_id = self.get_uuid()
task_name = (
    "virtual_tenants:stateful_alerts_duplicate_opened_incidents_cleanup"
)
logging.info(
    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
# counters
duplicate_opened_incidents_found_count = 0
duplicate_opened_incidents_resolved_count = 0
duplicate_opened_incidents_resolution_failures_count = 0
# Iterate over all Virtual Tenants and close duplicate opened incidents,
# keeping only the most recent incident (by mtime) per object_id.
for vtenant_record in vtenant_records:
    # get the tenant_id
    tenant_id = vtenant_record.get("tenant_id")
    # check if tenant is a replica tenant, if so, skip it
    if vtenant_record.get("tenant_replica", 0) == 1:
        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, is a replica tenant, skipping.'
        )
        continue
    # Define the query to find duplicate opened incidents for the same object_id
    # NOTE(review): the SPL uses "=0" (not "==0") in eval if() and builds the
    # mvfind regex by concatenating the raw keyid/incident_id values; confirm
    # the expression evaluates as intended against a live Splunk instance.
    search = remove_leading_spaces(
        f"""
        | inputlookup trackme_stateful_alerting_tenant_{tenant_id} where alert_status="opened" | eval keyid=_key
        | eval _time=mtime
        | stats count as incident_count, values(incident_id) as incident_ids, latest(incident_id) as latest_incident_id, latest(keyid) as latest_keyid, values(keyid) as keyids, max(mtime) as max_mtime by object_id
        | where incident_count > 1
        | eval to_close_keyids=mvmap(keyids, if(mvfind(keyids, "^\\\\"" + latest_keyid + "\\$")=0, null(), keyids))
        | eval to_close_incident_ids=mvmap(incident_ids, if(mvfind(incident_ids, "^\\\\"" + latest_incident_id + "\\$")=0, null(), incident_ids))
        | table object_id, incident_count, latest_incident_id, latest_keyid, max_mtime, to_close_keyids, to_close_incident_ids
        """
    )
    # A list to store duplicate opened incidents data
    duplicate_opened_incidents_list = []
    # Run the search
    logging.info(
        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, running search="{search}"'
    )
    try:
        # run_splunk_search: shared helper; 24/5 are presumably the
        # max-retry/sleep arguments — TODO confirm against its signature
        reader = run_splunk_search(
            self.service,
            search,
            {
                "earliest_time": "-5m",
                "latest_time": "now",
                "output_mode": "json",
                "count": 0,
            },
            24,
            5,
        )
        for item in reader:
            # only dict items are result rows (the reader may also emit messages)
            if isinstance(item, dict):
                duplicate_opened_incidents_list.append(item)
                duplicate_opened_incidents_found_count += 1
                logging.info(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, found duplicate opened incidents for object_id={item.get("object_id")}, incident_count={item.get("incident_count")}, latest_incident_id={item.get("latest_incident_id")}, latest_keyid={item.get("latest_keyid")}, max_mtime={item.get("max_mtime")}'
                )
    except Exception as e:
        logging.error(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to retrieve the list of duplicate opened incidents, exception="{str(e)}"'
        )
    # If nothing to do, continue
    if not duplicate_opened_incidents_list:
        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, no duplicate opened incidents to process, skipping.'
        )
        continue
    # Process duplicate opened incidents
    for duplicate_incident in duplicate_opened_incidents_list:
        object_id = duplicate_incident.get("object_id")
        # multivalue fields may come back as a list or a CSV string
        to_close_keyids = duplicate_incident.get("to_close_keyids", [])
        # if not a list, turn into a list csv
        if not isinstance(to_close_keyids, list):
            to_close_keyids = to_close_keyids.split(",")
        to_close_incident_ids = duplicate_incident.get(
            "to_close_incident_ids", []
        )
        # if not a list, turn into a list csv
        if not isinstance(to_close_incident_ids, list):
            to_close_incident_ids = to_close_incident_ids.split(",")
        # Parse the to_close_keyids and to_close_incident_ids
        # (strip whitespace and drop empty entries)
        if to_close_keyids:
            to_close_keyids_list = [
                keyid.strip()
                for keyid in to_close_keyids
                if keyid.strip()
            ]
        else:
            to_close_keyids_list = []
        if to_close_incident_ids:
            to_close_incident_ids_list = [
                incident_id.strip()
                for incident_id in to_close_incident_ids
                if incident_id.strip()
            ]
        else:
            to_close_incident_ids_list = []
        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, object_id={object_id}, will close {len(to_close_keyids_list)} duplicate incidents: keyids={to_close_keyids_list}, incident_ids={to_close_incident_ids_list}'
        )
        # Connect to the collection
        collection_stateful_alerting_name = (
            f"kv_trackme_stateful_alerting_tenant_{tenant_id}"
        )
        collection_stateful_alerting = self.service.kvstore[
            collection_stateful_alerting_name
        ]
        # Update each duplicate incident to closed status
        for keyid in to_close_keyids_list:
            try:
                # Get the current record
                record_list = collection_stateful_alerting.data.query(
                    query=json.dumps({"_key": keyid})
                )
                if record_list and len(record_list) > 0:
                    # Extract the first (and should be only) record from the list
                    record = record_list[0]
                    # Update the record to closed status
                    record["alert_status"] = "closed"
                    record["mtime"] = int(
                        time.time()
                    )  # Update modification time
                    # Update the record in the collection
                    collection_stateful_alerting.data.update(
                        keyid, json.dumps(record)
                    )
                    duplicate_opened_incidents_resolved_count += 1
                    logging.info(
                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, object_id={object_id}, successfully closed duplicate incident with keyid={keyid}'
                    )
                else:
                    logging.warning(
                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, object_id={object_id}, record with keyid={keyid} not found in collection'
                    )
            except Exception as e:
                duplicate_opened_incidents_resolution_failures_count += 1
                logging.error(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, object_id={object_id}, failed to close duplicate incident with keyid={keyid}, exception="{str(e)}"'
                )
# add to global_results_dict: per-task summary counters, keyed by task name
global_results_dict[
    f"{task_name}"
] = {
    "duplicate_opened_incidents_found_count": duplicate_opened_incidents_found_count,
    "duplicate_opened_incidents_resolved_count": duplicate_opened_incidents_resolved_count,
    "duplicate_opened_incidents_resolution_failures_count": duplicate_opened_incidents_resolution_failures_count,
    # human-readable summary of the task outcome
    "result": f"{duplicate_opened_incidents_found_count} duplicate opened incidents found, {duplicate_opened_incidents_resolved_count} resolved successfully, {duplicate_opened_incidents_resolution_failures_count} resolution failures",
}
logging.info(
    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
)
############################################################
# TrackMe Virtual Tenants stateful charts records expiration
# Goals:
# - For each Virtual tenant, purge any record in the stateful charts KVstore collection:
#   trackme_stateful_alerting_charts_tenant_<tenant_id>
# - For each KVrecord which is equal or older to the configured expiration
#   (default intent: 48 hours), based on the field "ctime" of the record
#   which contains the epochtime of its creation
############################################################
# task_start: epoch used to compute run_time when the task terminates
task_start = time.time()
# per-task unique identifier for log correlation
task_instance_id = self.get_uuid()
task_name = "virtual_tenants:stateful_charts_records_expiration"
logging.info(
    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
# get the stateful records expiration days from the app configuration
stateful_records_expiration_days = int(
    reqinfo["trackme_conf"]["trackme_general"][
        "trackme_stateful_charts_records_expiration_days"
    ]
)
# Define the expiration threshold in seconds (based on the expiration days)
charts_records_expiration_seconds = stateful_records_expiration_days * 24 * 3600
# reference "now" used for age computation across all tenants
current_time = time.time()
# counters (reported in global_results_dict at the end of the task)
expired_charts_records_deleted_count = 0
expired_charts_records_deletion_failures_count = 0
# Iterate over all Virtual Tenants and purge stateful charts records older
# than the configured expiration threshold (based on the record ctime).
for vtenant_record in vtenant_records:
    # get the tenant_id
    tenant_id = vtenant_record.get("tenant_id")
    # check if tenant is a replica tenant, if so, skip it
    if vtenant_record.get("tenant_replica", 0) == 1:
        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, is a replica tenant, skipping.'
        )
        continue
    # connect to the stateful charts collection
    collection_stateful_charts_name = (
        f"kv_trackme_stateful_alerting_charts_tenant_{tenant_id}"
    )
    try:
        collection_stateful_charts = self.service.kvstore[
            collection_stateful_charts_name
        ]
        # get all records from the collection
        (
            charts_records,
            charts_collection_keys,
            charts_collection_dict,
        ) = get_full_kv_collection(
            collection_stateful_charts, collection_stateful_charts_name
        )
        # A list to store expired records to delete
        expired_charts_records_list = []
        # Process each record to check if it is older than the configured expiration
        for record in charts_records:
            try:
                # Get the ctime field (epoch of record creation)
                ctime_str = record.get("ctime")
                if ctime_str is None:
                    logging.warning(
                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, record with key={record.get("_key")} has no ctime field, skipping.'
                    )
                    continue
                # float() accepts both str and numeric values; the original
                # code had an isinstance branch whose two bodies were identical
                ctime_float = float(ctime_str)
                # Calculate age in seconds
                record_age_seconds = current_time - ctime_float
                # Check if record is older than the configured expiration threshold
                # (the original log hardcoded "48 hours" regardless of the setting)
                if record_age_seconds >= charts_records_expiration_seconds:
                    expired_charts_records_list.append(record.get("_key"))
                    logging.info(
                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, record with key={record.get("_key")} is {round(record_age_seconds/3600, 2)} hours old (>= {round(charts_records_expiration_seconds/3600, 2)} hours), will be deleted.'
                    )
            except (ValueError, TypeError) as e:
                # malformed ctime value: log and keep the record
                logging.error(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to process ctime field for record with key={record.get("_key")}, ctime="{ctime_str}", exception="{str(e)}"'
                )
                continue
        # If no expired records, continue to next tenant
        if not expired_charts_records_list:
            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, no expired charts records to process, skipping.'
            )
            continue
        # Delete expired records from the collection
        for expired_record_key in expired_charts_records_list:
            try:
                # Remove the record by its KVstore _key (delete-by-query form)
                collection_stateful_charts.data.delete(
                    json.dumps({"_key": expired_record_key})
                )
                expired_charts_records_deleted_count += 1
                logging.info(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, successfully deleted expired charts record with key={expired_record_key}'
                )
            except Exception as e:
                expired_charts_records_deletion_failures_count += 1
                logging.error(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to delete expired charts record with key={expired_record_key}, exception="{str(e)}"'
                )
    except Exception as e:
        # collection access failure for this tenant: log and move on
        logging.error(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to access collection="{collection_stateful_charts_name}", exception="{str(e)}"'
        )
# add to global_results_dict: per-task summary counters, keyed by task name
global_results_dict[f"{task_name}"] = {
    "expired_charts_records_deleted_count": expired_charts_records_deleted_count,
    "expired_charts_records_deletion_failures_count": expired_charts_records_deletion_failures_count,
    # human-readable summary of the task outcome
    "result": f"{expired_charts_records_deleted_count} expired charts records deleted, {expired_charts_records_deletion_failures_count} deletion failures",
}
logging.info(
    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
)
############################################################
# Recurring Bank Holidays Management
# Goals:
# - Process recurring bank holidays and create future occurrences
# - Handle holidays that span across years (e.g., Dec 31 - Jan 1)
# - Clean up past bank holiday periods that have already ended
############################################################
# task_start: epoch used to compute run_time when the task terminates
task_start = time.time()
# per-task unique identifier for log correlation
task_instance_id = self.get_uuid()
task_name = "bank-holidays:recurring_periods_management"
logging.info(
    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
)
# counters (reported in global_results_dict at the end of the task)
recurring_holidays_processed_count = 0
new_periods_created_count = 0
periods_creation_failures_count = 0
past_periods_deleted_count = 0
past_periods_deletion_failures_count = 0
# Main bank holidays processing: cleanup of past periods (step 1) then
# creation of future occurrences for recurring holidays (step 2).
# NOTE(review): `header` (REST auth headers) and `self.safe_create_datetime`
# are defined outside this fragment — confirmed only by usage here.
try:
    # Connect to bank holidays collection
    collection_name = "kv_trackme_bank_holidays"
    collection = self.service.kvstore[collection_name]
    # Get current time and year (UTC)
    current_time = time.time()
    current_year = datetime.datetime.fromtimestamp(current_time, tz=datetime.timezone.utc).year
    ############################################################
    # Step 1: Clean up past bank holiday periods
    # Strategy:
    # - For recurring holidays: Keep the oldest one for each pattern (template), delete other past duplicates
    # - For non-recurring holidays: Delete all that are past
    ############################################################
    logging.info(
        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting cleanup of past bank holiday periods.'
    )
    try:
        # Get all bank holidays
        all_holidays = collection.data.query()
        # Group recurring holidays by pattern to identify templates
        recurring_by_pattern = {}
        non_recurring_past = []
        for holiday in all_holidays:
            holiday_dict = dict(holiday)
            holiday_key = holiday_dict.get("_key")
            # assumes start_date/end_date are epoch numbers — TODO confirm
            # the KVstore schema (fromtimestamp below would fail on strings,
            # which is caught and treated as non-recurring)
            end_date_epoch = holiday_dict.get("end_date")
            is_recurring = holiday_dict.get("is_recurring", False)
            if not end_date_epoch:
                logging.warning(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, holiday with key={holiday_key} has no end_date, skipping cleanup check.'
                )
                continue
            # Check if period has already passed
            if int(end_date_epoch) < int(current_time):
                if is_recurring:
                    # For recurring holidays, group by pattern to keep templates
                    period_name = holiday_dict.get("period_name", "")
                    country_code = holiday_dict.get("country_code", "")
                    start_date_epoch = holiday_dict.get("start_date")
                    if start_date_epoch:
                        try:
                            start_dt = datetime.datetime.fromtimestamp(start_date_epoch, tz=datetime.timezone.utc)
                            end_dt = datetime.datetime.fromtimestamp(end_date_epoch, tz=datetime.timezone.utc)
                            # Create pattern key: name + country + month/day of start and end
                            pattern_key = f"{period_name}|{country_code}|{start_dt.month:02d}-{start_dt.day:02d}|{end_dt.month:02d}-{end_dt.day:02d}"
                            if pattern_key not in recurring_by_pattern:
                                recurring_by_pattern[pattern_key] = []
                            recurring_by_pattern[pattern_key].append({
                                "key": holiday_key,
                                "time_created": holiday_dict.get("time_created", 0),
                                "holiday": holiday_dict
                            })
                        except Exception as e:
                            logging.warning(
                                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to parse dates for holiday key={holiday_key}, exception="{str(e)}", treating as non-recurring for cleanup.'
                            )
                            non_recurring_past.append(holiday_dict)
                    else:
                        # Missing start_date, treat as non-recurring
                        non_recurring_past.append(holiday_dict)
                else:
                    # Non-recurring past holiday - mark for deletion
                    non_recurring_past.append(holiday_dict)
        # Delete non-recurring past holidays via REST API
        for holiday_dict in non_recurring_past:
            holiday_key = holiday_dict.get("_key")
            period_name = holiday_dict.get("period_name", "unknown")
            try:
                end_date_epoch = holiday_dict.get("end_date")
                end_dt = datetime.datetime.fromtimestamp(end_date_epoch, tz=datetime.timezone.utc)
                end_date_str = end_dt.strftime("%Y-%m-%d %H:%M:%S")
                # Use REST API endpoint for deletion (enables auditing)
                target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/bank_holidays/admin/delete"
                payload = {"_key": holiday_key}
                response = requests.post(
                    target_url,
                    headers=header,
                    data=json.dumps(payload),
                    verify=False,
                    timeout=600
                )
                if response.status_code == 200:
                    past_periods_deleted_count += 1
                    logging.info(
                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, deleted past non-recurring bank holiday via REST API: key={holiday_key}, period_name="{period_name}", end_date={end_date_str}'
                    )
                else:
                    past_periods_deletion_failures_count += 1
                    logging.error(
                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to delete past non-recurring bank holiday via REST API: key={holiday_key}, period_name="{period_name}", status_code={response.status_code}, response={response.text}'
                    )
            except Exception as e:
                past_periods_deletion_failures_count += 1
                logging.error(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to delete past non-recurring bank holiday: key={holiday_key}, period_name="{period_name}", exception="{str(e)}"'
                )
        # For recurring holidays, keep the oldest one (template) for each pattern, delete others
        for pattern_key, holidays_list in recurring_by_pattern.items():
            if len(holidays_list) > 1:
                # Sort by time_created (oldest first) - keep the first one as template
                holidays_list.sort(key=lambda x: x.get("time_created", 0))
                template = holidays_list[0]
                duplicates = holidays_list[1:]
                logging.debug(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, pattern="{pattern_key}" has {len(holidays_list)} past occurrences, keeping template (key={template["key"]}), deleting {len(duplicates)} duplicates.'
                )
                # Delete duplicate past periods via REST API
                for duplicate in duplicates:
                    duplicate_key = duplicate["key"]
                    duplicate_holiday = duplicate["holiday"]
                    period_name = duplicate_holiday.get("period_name", "unknown")
                    try:
                        end_date_epoch = duplicate_holiday.get("end_date")
                        end_dt = datetime.datetime.fromtimestamp(end_date_epoch, tz=datetime.timezone.utc)
                        end_date_str = end_dt.strftime("%Y-%m-%d %H:%M:%S")
                        # Use REST API endpoint for deletion (enables auditing)
                        target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/bank_holidays/admin/delete"
                        payload = {"_key": duplicate_key}
                        response = requests.post(
                            target_url,
                            headers=header,
                            data=json.dumps(payload),
                            verify=False,
                            timeout=600
                        )
                        if response.status_code == 200:
                            past_periods_deleted_count += 1
                            logging.info(
                                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, deleted duplicate past recurring bank holiday via REST API: key={duplicate_key}, period_name="{period_name}", pattern="{pattern_key}", end_date={end_date_str}'
                            )
                        else:
                            past_periods_deletion_failures_count += 1
                            logging.error(
                                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to delete duplicate past recurring bank holiday via REST API: key={duplicate_key}, period_name="{period_name}", status_code={response.status_code}, response={response.text}'
                            )
                    except Exception as e:
                        past_periods_deletion_failures_count += 1
                        logging.error(
                            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to delete duplicate past recurring bank holiday: key={duplicate_key}, period_name="{period_name}", exception="{str(e)}"'
                        )
            # If only one past occurrence, keep it as template (no deletion needed)
        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, cleanup completed: {past_periods_deleted_count} past periods deleted, {past_periods_deletion_failures_count} deletion failures.'
        )
    except Exception as e:
        logging.error(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed during cleanup of past bank holidays, exception="{str(e)}"'
        )
    ############################################################
    # Step 2: Process recurring holidays and create future occurrences
    # Ensure we have periods for current year + next year (year+1)
    ############################################################
    # Get all recurring bank holidays (after cleanup)
    query_recurring = json.dumps({"is_recurring": True})
    recurring_holidays = collection.data.query(query=query_recurring)
    if not recurring_holidays:
        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, no recurring bank holidays found, skipping creation task.'
        )
    else:
        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, found {len(recurring_holidays)} recurring bank holiday(s) to process.'
        )
        # Plan ahead for current year + next year (year+1)
        # This ensures we always have periods for the current year (if not yet passed) and next year
        years_to_check = [current_year, current_year + 1]
        # Get all existing bank holidays to check for duplicates
        all_existing_holidays = collection.data.query()
        existing_periods_by_pattern = {}
        # Group existing holidays by pattern (period_name + country_code + month/day)
        for holiday in all_existing_holidays:
            holiday_dict = dict(holiday)
            period_name = holiday_dict.get("period_name", "")
            country_code = holiday_dict.get("country_code", "")
            start_date_epoch = holiday_dict.get("start_date")
            end_date_epoch = holiday_dict.get("end_date")
            if start_date_epoch and end_date_epoch:
                start_dt = datetime.datetime.fromtimestamp(start_date_epoch, tz=datetime.timezone.utc)
                end_dt = datetime.datetime.fromtimestamp(end_date_epoch, tz=datetime.timezone.utc)
                # Create a pattern key: period_name + country_code + month/day of start and end
                pattern_key = f"{period_name}|{country_code}|{start_dt.month:02d}-{start_dt.day:02d}|{end_dt.month:02d}-{end_dt.day:02d}"
                if pattern_key not in existing_periods_by_pattern:
                    existing_periods_by_pattern[pattern_key] = []
                existing_periods_by_pattern[pattern_key].append({
                    "year": start_dt.year,
                    "record": holiday_dict
                })
        # Process each recurring holiday
        for recurring_holiday in recurring_holidays:
            recurring_holidays_processed_count += 1
            holiday_dict = dict(recurring_holiday)
            period_name = holiday_dict.get("period_name", "")
            country_code = holiday_dict.get("country_code", "")
            comment = holiday_dict.get("comment", "")
            start_date_epoch = holiday_dict.get("start_date")
            end_date_epoch = holiday_dict.get("end_date")
            src_user = holiday_dict.get("src_user", "system")
            if not start_date_epoch or not end_date_epoch:
                logging.warning(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, recurring holiday with key={holiday_dict.get("_key")} has invalid dates, skipping.'
                )
                continue
            # Parse original dates
            try:
                start_dt = datetime.datetime.fromtimestamp(start_date_epoch, tz=datetime.timezone.utc)
                end_dt = datetime.datetime.fromtimestamp(end_date_epoch, tz=datetime.timezone.utc)
            except Exception as e:
                logging.error(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to parse dates for recurring holiday key={holiday_dict.get("_key")}, exception="{str(e)}"'
                )
                continue
            # Extract month/day from original dates
            start_month = start_dt.month
            start_day = start_dt.day
            start_hour = start_dt.hour
            start_minute = start_dt.minute
            end_month = end_dt.month
            end_day = end_dt.day
            end_hour = end_dt.hour
            end_minute = end_dt.minute
            # Create pattern key for this recurring holiday
            pattern_key = f"{period_name}|{country_code}|{start_month:02d}-{start_day:02d}|{end_month:02d}-{end_day:02d}"
            # Check which years already have this holiday
            # We check both by year and by actual date range to be more robust
            existing_years = set()
            existing_date_ranges = {}  # year -> list of (start_epoch, end_epoch) tuples
            if pattern_key in existing_periods_by_pattern:
                for existing_period in existing_periods_by_pattern[pattern_key]:
                    year = existing_period["year"]
                    existing_years.add(year)
                    record = existing_period["record"]
                    if year not in existing_date_ranges:
                        existing_date_ranges[year] = []
                    existing_date_ranges[year].append((
                        record.get("start_date"),
                        record.get("end_date")
                    ))
            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, processing recurring holiday: period_name="{period_name}", pattern_key="{pattern_key}", existing_years={sorted(existing_years)}'
            )
            # Create periods for missing years
            for target_year in years_to_check:
                # Check if year already exists
                if target_year in existing_years:
                    # Double-check by verifying the date range matches
                    # This handles edge cases where same year might have been created manually
                    should_skip = False
                    if target_year in existing_date_ranges:
                        # Calculate what the date range should be for this year
                        if end_month < start_month or (end_month == start_month and end_day < start_day):
                            # Year-spanning: target_year to target_year+1
                            expected_start = self.safe_create_datetime(
                                target_year, start_month, start_day, start_hour, start_minute,
                                tzinfo=datetime.timezone.utc
                            ).timestamp()
                            expected_end = self.safe_create_datetime(
                                target_year + 1, end_month, end_day, end_hour, end_minute,
                                tzinfo=datetime.timezone.utc
                            ).timestamp()
                        else:
                            # Normal: both in target_year
                            expected_start = self.safe_create_datetime(
                                target_year, start_month, start_day, start_hour, start_minute,
                                tzinfo=datetime.timezone.utc
                            ).timestamp()
                            expected_end = self.safe_create_datetime(
                                target_year, end_month, end_day, end_hour, end_minute,
                                tzinfo=datetime.timezone.utc
                            ).timestamp()
                        # Check if any existing period matches this date range (within same day tolerance)
                        for existing_start, existing_end in existing_date_ranges[target_year]:
                            if existing_start and existing_end:
                                # Check if dates are on the same day (tolerance for time differences)
                                existing_start_dt = datetime.datetime.fromtimestamp(existing_start, tz=datetime.timezone.utc)
                                expected_start_dt = datetime.datetime.fromtimestamp(expected_start, tz=datetime.timezone.utc)
                                if (existing_start_dt.year == expected_start_dt.year and
                                    existing_start_dt.month == expected_start_dt.month and
                                    existing_start_dt.day == expected_start_dt.day):
                                    should_skip = True
                                    break
                    if should_skip:
                        logging.debug(
                            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, period already exists for year={target_year}, period_name="{period_name}", skipping.'
                        )
                        continue
                # Calculate start and end dates for target year
                try:
                    # Handle year-spanning holidays (e.g., Dec 31 - Jan 1)
                    if end_month < start_month or (end_month == start_month and end_day < start_day):
                        # Holiday spans across years (e.g., Dec 31 - Jan 1)
                        # Start date is in target_year, end date is in target_year + 1
                        new_start_dt = self.safe_create_datetime(
                            target_year, start_month, start_day, start_hour, start_minute,
                            tzinfo=datetime.timezone.utc
                        )
                        new_end_dt = self.safe_create_datetime(
                            target_year + 1, end_month, end_day, end_hour, end_minute,
                            tzinfo=datetime.timezone.utc
                        )
                    else:
                        # Normal holiday within the same year
                        new_start_dt = self.safe_create_datetime(
                            target_year, start_month, start_day, start_hour, start_minute,
                            tzinfo=datetime.timezone.utc
                        )
                        new_end_dt = self.safe_create_datetime(
                            target_year, end_month, end_day, end_hour, end_minute,
                            tzinfo=datetime.timezone.utc
                        )
                    # Convert to epoch timestamps
                    new_start_epoch = int(round(new_start_dt.timestamp()))
                    new_end_epoch = int(round(new_end_dt.timestamp()))
                    # Validate date range
                    if new_end_epoch <= new_start_epoch:
                        logging.warning(
                            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, invalid date range for year={target_year}, period_name="{period_name}", skipping.'
                        )
                        continue
                    # Create new record via REST API (enables auditing and delegates complexity)
                    try:
                        target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/bank_holidays/admin/create"
                        payload = {
                            "period_name": period_name,
                            "start_date": new_start_epoch,
                            "end_date": new_end_epoch,
                            "comment": comment,
                            "country_code": country_code,
                            "is_recurring": True,  # Keep recurring flag
                        }
                        response = requests.post(
                            target_url,
                            headers=header,
                            data=json.dumps(payload),
                            verify=False,
                            timeout=600
                        )
                        if response.status_code == 200:
                            response_data = response.json()
                            created_record = response_data.get("payload", {})
                            new_key = created_record.get("_key")
                            new_periods_created_count += 1
                            logging.info(
                                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, successfully created recurring bank holiday period via REST API: key={new_key}, period_name="{period_name}", year={target_year}, start_date={new_start_dt.strftime("%Y-%m-%d %H:%M")}, end_date={new_end_dt.strftime("%Y-%m-%d %H:%M")}'
                            )
                        elif response.status_code == 409:
                            # 409 Conflict means the period already exists (duplicate detection)
                            # This is expected behavior, not an error - log at debug level
                            logging.debug(
                                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, recurring bank holiday period already exists (duplicate detected): period_name="{period_name}", year={target_year}, status_code=409'
                            )
                        else:
                            periods_creation_failures_count += 1
                            logging.error(
                                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to create recurring bank holiday period via REST API for year={target_year}, period_name="{period_name}", status_code={response.status_code}, response={response.text}'
                            )
                    except Exception as e:
                        periods_creation_failures_count += 1
                        logging.error(
                            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to create recurring bank holiday period for year={target_year}, period_name="{period_name}", exception="{str(e)}"'
                        )
                except Exception as e:
                    periods_creation_failures_count += 1
                    logging.error(
                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to calculate dates for year={target_year}, period_name="{period_name}", exception="{str(e)}"'
                    )
except Exception as e:
    logging.error(
        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to process recurring bank holidays, exception="{str(e)}"'
    )
# add to global_results_dict: per-task summary counters, keyed by task name
global_results_dict[f"{task_name}"] = {
    "recurring_holidays_processed_count": recurring_holidays_processed_count,
    "new_periods_created_count": new_periods_created_count,
    "periods_creation_failures_count": periods_creation_failures_count,
    "past_periods_deleted_count": past_periods_deleted_count,
    "past_periods_deletion_failures_count": past_periods_deletion_failures_count,
    # human-readable summary of the task outcome
    "result": f"{recurring_holidays_processed_count} recurring holidays processed, {new_periods_created_count} new periods created, {periods_creation_failures_count} creation failures, {past_periods_deleted_count} past periods deleted, {past_periods_deletion_failures_count} deletion failures",
}
logging.info(
    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
)
#
# End
#
# yield the results: a single event carrying the aggregated per-task
# summaries collected in global_results_dict during this run
yield_record = {
    "_time": time.time(),
    "_raw": global_results_dict,
    "results": global_results_dict,
}
yield yield_record
#
# END
#
# end general task: log total runtime (global_start is set at command entry)
logging.info(
    f"instance_id={instance_id}, trackmegeneralhealthmanager has terminated, total_run_time={round(time.time() - global_start, 3)}"
)
dispatch(HealthTracker, sys.argv, sys.stdin, sys.stdout, __name__)