#!/usr/bin/env python
# coding=utf-8

__author__ = "TrackMe Limited"
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
__credits__ = "TrackMe Limited, U.K."
__license__ = "TrackMe Limited, all rights reserved"
__version__ = "0.1.0"
__maintainer__ = "TrackMe Limited, U.K."
__email__ = "support@trackme-solutions.com"
__status__ = "PRODUCTION"

# Standard library imports
import os
import sys
import time
import json
import uuid
import datetime

# Logging imports
import logging
from logging.handlers import RotatingFileHandler

# Networking imports
import requests
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# splunk home
splunkhome = os.environ["SPLUNK_HOME"]

# set logging
filehandler = RotatingFileHandler(
    "%s/var/log/splunk/trackme_general_health_manager.log" % splunkhome,
    mode="a",
    maxBytes=10000000,
    backupCount=1,
)
formatter = logging.Formatter(
    "%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s"
)
logging.Formatter.converter = time.gmtime
filehandler.setFormatter(formatter)
log = logging.getLogger()  # root logger - good to get it only once
for hdlr in log.handlers[:]:  # remove the existing file handlers
    if isinstance(hdlr, logging.FileHandler):
        log.removeHandler(hdlr)
log.addHandler(filehandler)  # set the new handler
# set the log level to INFO, DEBUG as the default is ERROR
log.setLevel(logging.INFO)

# append current directory
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# import libs
import import_declare_test

# import Splunk libs
from splunklib.searchcommands import (
    dispatch,
    GeneratingCommand,
    Configuration,
    Option,
    validators,
)

# import trackme libs
from trackme_libs import (
    trackme_reqinfo,
    run_splunk_search,
    trackme_manage_report_schedule,
    trackme_report_update_enablement,
)

# import trackme libs utils
from trackme_libs_utils import remove_leading_spaces

# import TrackMe get data libs
from trackme_libs_get_data import get_full_kv_collection

# import the collections dict
from collections_data import collections_dict
from collections_data import (
    collections_list_dsm,
    collections_list_flx,
    collections_list_fqm,
    collections_list_dhm,
    collections_list_mhm,
    collections_list_wlk,
    collections_list_common,
)

# logging:
# To avoid overriding the logging destination of callers, the libs purposely do not
# set any logging definition and rely on the callers themselves


@Configuration(distributed=False)
class HealthTracker(GeneratingCommand):

    @staticmethod
    def safe_create_datetime(year, month, day, hour=0, minute=0, second=0, tzinfo=None):
        """
        Safely create a datetime object, handling leap years.
        If trying to create Feb 29 in a non-leap year, falls back to Feb 28.

        Args:
            year: Year
            month: Month (1-12)
            day: Day of month
            hour: Hour (default 0)
            minute: Minute (default 0)
            second: Second (default 0)
            tzinfo: Timezone info (default None)

        Returns:
            datetime.datetime object
        """
        # Check if this is Feb 29 and the year is not a leap year
        if month == 2 and day == 29:
            # Check if year is a leap year
            is_leap_year = (year % 4 == 0 and year % 100 != 0) or (year % 400 == 0)
            if not is_leap_year:
                # Fall back to Feb 28 for non-leap years
                day = 28
                logging.debug(
                    f"Leap year adjustment: Feb 29 in non-leap year {year} adjusted to Feb 28"
                )

        return datetime.datetime(year, month, day, hour, minute, second, tzinfo=tzinfo)
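    # Illustrative example (not part of the runtime flow): since 2023 is not a
    # leap year, HealthTracker.safe_create_datetime(2023, 2, 29) returns
    # datetime.datetime(2023, 2, 28, 0, 0), while the same call with year=2024
    # returns Feb 29 as requested.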

    def get_uuid(self):
        """
        Function to return a unique uuid which is used to trace the performance run_time of each subtask.
        """
        return str(uuid.uuid4())

    def get_ml_rules_collection(self, collection):
        """
        Get all records from an ML rules collection.

        :param collection: The collection to query.
        :return: A list of records, a dictionary of records, a list of keys.
        """

        collection_records = []
        collection_records_dict = {}
        count_to_process_list = []

        end = False
        skip_tracker = 0
        while not end:
            process_collection_records = collection.data.query(skip=skip_tracker)
            if process_collection_records:
                for item in process_collection_records:
                    collection_records.append(item)
                    collection_records_dict[item.get("_key")] = (
                        item  # Add the entire item to the dictionary
                    )
                    count_to_process_list.append(item.get("_key"))
                skip_tracker += 5000
            else:
                end = True

        return collection_records, collection_records_dict, count_to_process_list
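    # Note on pagination: collection.data.query() returns records in batches, so
    # the loop above keeps advancing the skip offset (assuming a 5000-record batch
    # size per query) until an empty batch signals the whole collection was read.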

    def remove_ml_model(
        self,
        component,
        rest_url,
        header,
        ml_model_lookup_name,
        instance_id=None,
        task_name=None,
        task_instance_id=None,
    ):
        """
        Remove an orphan Machine Learning model lookup file.

        :param component: The component name.
        :param rest_url: The REST URL to use.
        :param header: The header to use.
        :param ml_model_lookup_name: The Machine Learning model lookup name.
        :param instance_id: The instance ID for logging purposes.
        :param task_name: The task name for logging purposes.
        :param task_instance_id: The task instance ID for logging purposes.
        :return: True if the model was removed successfully, raises an Exception otherwise.
        """

        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{component}", attempting to delete orphan Machine Learning lookup_name="{ml_model_lookup_name}"'
        )
        try:
            response = requests.delete(
                rest_url,
                headers=header,
                verify=False,
                timeout=600,
            )
            if response.status_code not in (
                200,
                201,
                204,
            ):
                error_msg = f'failure to delete ML lookup_name="{ml_model_lookup_name}", url="{rest_url}", response.status_code="{response.status_code}", response.text="{response.text}"'
                raise Exception(error_msg)
            else:
                logging.info(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, action="success", deleted lookup_name="{ml_model_lookup_name}" successfully'
                )
                return True

        except Exception as e:
            error_msg = f'failure to delete ML lookup_name="{ml_model_lookup_name}" with exception="{str(e)}"'
            raise Exception(error_msg)
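    # Assumption for callers (illustrative): rest_url is the "id" URL of the
    # lookup-table-files entry as returned by the "| rest .../data/lookup-table-files"
    # search, i.e. a splunkd endpoint ending with the __mlspl_*.mlmodel file name;
    # an HTTP DELETE against that endpoint removes the model lookup file.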

    def reassign_ml_model(
        self,
        model_id,
        rest_url,
        header,
        instance_id=None,
        task_name=None,
        task_instance_id=None,
    ):
        """
        Reassign a Machine Learning model to the Splunk system user.

        :param model_id: The model_id to reassign.
        :param rest_url: The REST URL to use.
        :param header: The header to use.
        :param instance_id: The instance ID for logging purposes.
        :param task_name: The task name for logging purposes.
        :param task_instance_id: The task instance ID for logging purposes.
        :return: True if the model was reassigned successfully, otherwise False.
        """

        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, attempting to re-assign model_id="{model_id}" to splunk-system-user'
        )

        acl_properties = {
            "sharing": "user",
            "owner": "splunk-system-user",
        }

        # proceed boolean
        proceed = False

        # before re-assigning, check if the model exists by running a GET request; if the
        # status code is not 2xx, do not proceed and log an informational message instead
        try:
            response = requests.get(
                f"{rest_url}",
                headers=header,
                verify=False,
                timeout=600,
            )
            if response.status_code not in (200, 201, 204):
                logging.info(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, model_id="{model_id}" does not exist, it might have been re-assigned in the meantime, skipping re-assignment'
                )
                return False
            else:
                proceed = True
        except Exception as e:
            logging.error(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, model_id="{model_id}" failed to retrieve model, exception="{str(e)}"'
            )

        if proceed:

            try:
                response = requests.post(
                    f"{rest_url}/acl",
                    headers=header,
                    data=acl_properties,
                    verify=False,
                    timeout=600,
                )
                if response.status_code not in (
                    200,
                    201,
                    204,
                ):
                    error_msg = f'failure to reassign model_id="{model_id}", url="{rest_url}", response.status_code="{response.status_code}", response.text="{response.text}"'
                    raise Exception(error_msg)
                else:
                    logging.info(
                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, action="success", model_id="{model_id}" reassigned successfully'
                    )
                    return True

            except Exception as e:
                logging.error(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, action="failure", model_id="{model_id}" reassignment failed, exception="{str(e)}"'
                )
                raise Exception(str(e))
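    # Note: ownership is changed via the knowledge object's ACL endpoint, the
    # standard splunkd pattern: a POST to "<rest_url>/acl" with the owner and
    # sharing form fields defined in acl_properties above.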

    def get_all_accounts(self, instance_id=None, task_name=None, task_instance_id=None):
        """
        Return the list of existing remote accounts.

        :param instance_id: The instance ID for logging purposes.
        :param task_name: The task name for logging purposes.
        :param task_instance_id: The task instance ID for logging purposes.
        :return: A list of remote accounts.
        """

        # endpoint target
        url = f"{self._metadata.searchinfo.splunkd_uri}/servicesNS/nobody/trackme/trackme_account"

        # current_remote_accounts_list
        current_remote_accounts_list = []

        # first, get the list of remote accounts
        try:
            response = requests.get(
                url,
                headers={
                    "Authorization": f"Splunk {self._metadata.searchinfo.session_key}",
                    "Content-Type": "application/json",
                },
                verify=False,
                params={
                    "output_mode": "json",
                    "count": -1,
                },
                timeout=600,
            )

            response.raise_for_status()
            response_json = response.json()

            # The list of remote accounts is stored as a list in entry
            remote_accounts = response_json.get("entry", [])

            # iterate through the remote accounts and add each name to the list;
            # "content" holds the parameters dict for each account
            for remote_account in remote_accounts:
                remote_account_name = remote_account.get("name", None)

                # add to list
                current_remote_accounts_list.append(remote_account_name)

            return current_remote_accounts_list

        except Exception as e:
            logging.error(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, error while fetching remote account list: {str(e)}'
            )
            return []

    #
    # Main
    #

    def generate(self, **kwargs):
        if self:

            # performance counter
            global_start = time.time()

            # set instance_id
            instance_id = self.get_uuid()

            # Get request info and set logging level
            reqinfo = trackme_reqinfo(
                self._metadata.searchinfo.session_key,
                self._metadata.searchinfo.splunkd_uri,
            )
            log.setLevel(reqinfo["logging_level"])

            # Splunk header for REST requests
            header = {
                "Authorization": f"Splunk {self._metadata.searchinfo.session_key}",
                "Content-Type": "application/json",
            }
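            # Note: splunkd REST calls authenticate with the search-time session
            # key, passed as "Authorization: Splunk <session_key>".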

            logging.info(  # First log message
                'context="general_execution", trackmegeneralhealthmanager is starting now.'
            )

            # global_results_dict to store results of the execution
            global_results_dict = {}

            # Connect to the vtenants collection
            collection_vtenants_name = "kv_trackme_virtual_tenants"
            collection_vtenants = self.service.kvstore[collection_vtenants_name]

            # get all vtenants records, this job is not tenant specific
            vtenant_records = collection_vtenants.data.query()

            ############################################################
            # Machine Learning related global health manager tasks
            # Goals:
            # - Inspect all ML collections, identify orphan models,
            #   and reassign if necessary
            ############################################################

            # Reassignment: ensures that all ML models are owned by splunk-system-user,
            # and re-assigns them otherwise.
            # Run the following search to retrieve the list of existing ML models.

            task_start = time.time()
            task_instance_id = self.get_uuid()
            task_name = "mlmodels-management:splunk-system-user_reassignment"

            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
            )

            # counters
            ml_models_reassigned_success_count = 0
            ml_models_reassigned_failures_count = 0

            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting verification of ML models ownership and reassignment if necessary'
            )

            # Define the query
            search = '| rest splunk_server=local timeout=1200 "/servicesNS/nobody/trackme/data/lookup-table-files" | search eai:acl.app="trackme" AND title="__mlspl_model_*.mlmodel" | table title, id'

            kwargs_oneshot = {
                "earliest_time": "-5m",
                "latest_time": "now",
                "output_mode": "json",
                "count": 0,
            }
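            # Note: the time window is largely a formality for "| rest", which
            # queries splunkd directly rather than indexed events, and count=0
            # requests all available results.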

            # A list to store current ml models (filename)
            ml_models_for_reassignement_current_list = []

            # A dict to store the existing models
            ml_models_for_reassignement_dict_existing = {}

            try:
                reader = run_splunk_search(
                    self.service,
                    search,
                    kwargs_oneshot,
                    24,
                    5,
                )

                for item in reader:
                    if isinstance(item, dict):
                        ml_models_for_reassignement_current_list.append(
                            item.get("title")
                        )  # this is the model filename
                        ml_models_for_reassignement_dict_existing[item.get("title")] = {
                            "id": item.get("id")
                        }

            except Exception as e:
                logging.error(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to retrieve the list of ML models, exception="{str(e)}"'
                )

            # Loop
            for model_id in ml_models_for_reassignement_current_list:

                # reassign the model
                rest_url = ml_models_for_reassignement_dict_existing[model_id].get("id")

                logging.info(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, attempting reassignment of model_id="{model_id}"'
                )

                try:
                    reassigned_model = self.reassign_ml_model(
                        model_id,
                        rest_url,
                        header,
                        instance_id,
                        task_name,
                        task_instance_id,
                    )
                    if reassigned_model:
                        ml_models_reassigned_success_count += 1
                except Exception as e:
                    ml_models_reassigned_failures_count += 1
                    logging.error(
                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to reassign the model, model_id="{model_id}", exception="{str(e)}"'
                    )

            ############################################################
            # Identify ML models owned by splunk-system-user
            ############################################################

            # run the following search to retrieve the list of existing ML models

            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting verification of ML models ownership and reassignment if necessary'
            )

            # Define the query
            search = '| rest splunk_server=local timeout=1200 "/servicesNS/splunk-system-user/trackme/data/lookup-table-files" | search eai:acl.app="trackme" AND title="__mlspl_model_*.mlmodel" | table title, id'

            kwargs_oneshot = {
                "earliest_time": "-5m",
                "latest_time": "now",
                "output_mode": "json",
                "count": 0,
            }

            # A list to store current ml models (filename)
            ml_models_current_list = []

            # A dict to store the existing models
            ml_models_dict_existing = {}

            try:
                reader = run_splunk_search(
                    self.service,
                    search,
                    kwargs_oneshot,
                    24,
                    5,
                )

                for item in reader:
                    if isinstance(item, dict):
                        ml_models_current_list.append(
                            item.get("title")
                        )  # this is the model filename
                        ml_models_dict_existing[item.get("title")] = {
                            "id": item.get("id")
                        }

            except Exception as e:
                logging.error(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to retrieve the list of ML models, exception="{str(e)}"'
                )

            ############################################################
            # Identify ML models configured in TrackMe
            ############################################################

            # A list to store ml_rules_outliers_collections
            ml_rules_outliers_collections = []

            # A dict to store ml models definitions
            ml_models_dict = {}

            # A list to store ml models currently configured
            ml_models_list = []

            for vtenant_record in vtenant_records:
                logging.info(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, processing vtenant_record={json.dumps(vtenant_record, indent=2)}'
                )

                # get the tenant_id
                tenant_id = vtenant_record.get("tenant_id")

                # for component in dsm, dhm, flx, fqm, wlk
                for component in ["dsm", "dhm", "flx", "fqm", "wlk"]:

                    # get status
                    component_status = vtenant_record.get(f"tenant_{component}_enabled")

                    # append the collection
                    if component_status == 1:
                        ml_rules_outliers_collections.append(
                            f"kv_trackme_{component}_outliers_entity_rules_tenant_{tenant_id}"
                        )

            # for each outliers rules collection
            for ml_rules_collection_name in ml_rules_outliers_collections:

                # connect to the collection service and retrieve the records
                ml_rules_collection = self.service.kvstore[ml_rules_collection_name]

                # extract ml_rules_tenant_id from the collection name: trackme_<component>_outliers_entity_rules_tenant_<ml_rules_tenant_id>
                ml_rules_tenant_id = ml_rules_collection_name.split("_")[-1]

                # get records
                try:
                    ml_rules_records, ml_rules_records_dict, ml_rules_records_count = (
                        self.get_ml_rules_collection(ml_rules_collection)
                    )

                    for ml_rules_record in ml_rules_records:

                        # get key
                        ml_rules_record_key = ml_rules_record.get("_key")

                        # get the entities_outliers dictionary from the field entities_outliers
                        entities_outliers = json.loads(
                            ml_rules_record.get("entities_outliers")
                        )

                        # loop through entities_outliers, the dict key is the model_id
                        for ml_model_entity in entities_outliers:

                            ml_models_dict[ml_model_entity] = {
                                "model_id": ml_model_entity,
                                "collection_name": ml_rules_collection_name,
                                "collection_key": ml_rules_record_key,
                                "tenant_id": ml_rules_tenant_id,
                            }
                            ml_models_list.append(
                                f"__mlspl_{ml_model_entity}.mlmodel"
                            )  # this is the filename

                except Exception as e:
                    logging.error(
                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to retrieve the records from the collection, collection_name="{ml_rules_collection_name}", exception="{str(e)}"'
                    )

            # log
            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, {len(ml_models_dict)} ML models were found configured in TrackMe collections, will now start inspecting Splunk existing models.'
            )

            # log the number of currently existing models
            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, {len(ml_models_current_list)} ML models were found in the system, starting orphan models inspection'
            )

            #
            # orphan models purge / reassign
            #

            ml_models_purged_success_count = 0
            ml_models_purged_failures_count = 0

            # for each model in ml_models_current_list, if the model is not in ml_models_list, delete it
            for model_id in ml_models_current_list:
                if model_id not in ml_models_list and model_id != "pending":
                    # remove the model
                    rest_url = ml_models_dict_existing[model_id].get("id")

                    logging.info(
                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, attempting removal of model_id="{model_id}"'
                    )

                    try:
                        self.remove_ml_model(
                            "trackme",
                            rest_url,
                            header,
                            model_id,
                            instance_id,
                            task_name,
                            task_instance_id,
                        )
                        ml_models_purged_success_count += 1
                    except Exception as e:
                        ml_models_purged_failures_count += 1
                        logging.error(
                            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to remove the orphan model, model_id="{model_id}", exception="{str(e)}"'
                        )

            # end context="mlmodels-management"

            # log
            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, {ml_models_purged_success_count} orphan ML models were removed, {ml_models_purged_failures_count} orphan ML models removals failed, {ml_models_reassigned_success_count} ML models were reassigned to splunk-system-user, {ml_models_reassigned_failures_count} ML models reassignments failed'
            )

            # add to results
            global_results_dict["mlmodels_management"] = {
                "ml_models_in_system_count": len(ml_models_current_list),
                "ml_models_configured_count": len(ml_models_list),
                "ml_models_purged_success_count": ml_models_purged_success_count,
                "ml_models_purged_failures_count": ml_models_purged_failures_count,
                "ml_models_reassigned_success_count": ml_models_reassigned_success_count,
                "ml_models_reassigned_failures_count": ml_models_reassigned_failures_count,
                "result": f"{ml_models_purged_success_count} orphan ML models were removed, {ml_models_purged_failures_count} orphan ML models removals failed, {ml_models_reassigned_success_count} ML models were reassigned to splunk-system-user, {ml_models_reassigned_failures_count} ML models reassignments failed",
            }

            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
            )

            ############################################################
            # End Machine Learning related global health manager tasks
            ############################################################

            ############################################################
            # Splunk Remote Accounts maintenance
            # Goals:
            # - Call the associated REST endpoint for each existing account,
            #   to verify and update account parameters if needed, and
            #   perform token rotation if needed
            ############################################################

            task_start = time.time()
            task_instance_id = self.get_uuid()
            task_name = "splunk-remote-accounts:verify_and_maintain_accounts"

            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
            )

            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, context="splunk-remote-accounts", starting verification and maintenance of Splunk remote accounts'
            )

            # get all accounts
            current_remote_accounts_list = self.get_all_accounts(
                instance_id, task_name, task_instance_id
            )

            # remote_accounts_maintenance_dict
            remote_accounts_maintenance_dict = {}

            # Loop through accounts, and call the endpoint
            for account in current_remote_accounts_list:

                url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/configuration/admin/maintain_remote_account"

                try:
                    response = requests.post(
                        url,
                        headers={
                            "Authorization": f"Splunk {self._metadata.searchinfo.session_key}",
                            "Content-Type": "application/json",
                        },
                        verify=False,
                        data=json.dumps(
                            {
                                "accounts": account,
                            }
                        ),
                        timeout=600,
                    )

                    response.raise_for_status()
                    response_json = response.json()
                    remote_accounts_maintenance_dict[account] = response_json

                except Exception as e:
                    error_msg = f'error calling endpoint, exception="{str(e)}"'
                    logging.error(
                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, {error_msg}'
                    )
                    remote_accounts_maintenance_dict[account] = error_msg

            # add to global_results_dict; if the dict is empty, add a message as there were no actions to perform
            if not remote_accounts_maintenance_dict:
                global_results_dict[task_name] = {
                    "message": "No actions to perform."
                }
            else:
                global_results_dict[task_name] = remote_accounts_maintenance_dict

            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
            )

            ############################################################
            # TrackMe Virtual Tenants auto-repair
            # Goals:
            # - For each enabled Virtual Tenant, verify that all expected objects
            #   are effectively available in the system (KV collections...).
            # - If for some reason an expected object is missing,
            #   auto-repair will attempt to create it and fix the tenant inconsistency.
            ############################################################

            task_start = time.time()
            task_instance_id = self.get_uuid()
            task_name = "virtual_tenants:auto-repair:collections_and_transforms"

            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
            )

            # init auto_repair_actions_list
            auto_repair_actions_list = []

            # A dict to store objects that were verified and their status, per tenant_id as the key
            tenants_objects_status_dict = {}

            # collections map per component, including common collections
            collections_map_per_component = {
                "dsm": collections_list_dsm,
                "dhm": collections_list_dhm,
                "mhm": collections_list_mhm,
                "flx": collections_list_flx,
                "fqm": collections_list_fqm,
                "wlk": collections_list_wlk,
                "common": collections_list_common,  # Add common collections
            }
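            # Note: each entry maps a TrackMe component (or the shared "common" set)
            # to its expected collection base names; per tenant, each base name should
            # materialise both as a "kv_<name>_tenant_<tenant_id>" KVstore collection
            # and as a matching "<name>_tenant_<tenant_id>" transform.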

            for vtenant_record in vtenant_records:

                # get the tenant_id
                tenant_id = vtenant_record.get("tenant_id")

                # check if tenant is a replica tenant, if so, skip it
                if vtenant_record.get("tenant_replica", 0) == 1:
                    logging.info(
                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, is a replica tenant, skipping.'
                    )
                    continue

                # init tenant_id_checked_status_dict
                tenant_id_checked_status_dict = {}

                # get RBAC
                tenant_owner = str(vtenant_record.get("tenant_owner"))
                tenant_roles_admin = str(vtenant_record.get("tenant_roles_admin"))
                tenant_roles_user = str(vtenant_record.get("tenant_roles_user"))
                tenant_roles_power = str(vtenant_record.get("tenant_roles_power"))
                # TrackMe sharing level
                trackme_default_sharing = reqinfo["trackme_conf"]["trackme_general"][
                    "trackme_default_sharing"
                ]

                # for read permissions, concatenate admin, power and user
                tenant_roles_read_perms = (
                    f"{tenant_roles_admin},{tenant_roles_power},{tenant_roles_user}"
                )

                # for write permissions, concatenate admin, power
                tenant_roles_write_perms = f"{tenant_roles_admin},{tenant_roles_power}"

                # for component in dsm, dhm, mhm, flx, fqm, wlk and common
                for component in ["dsm", "dhm", "mhm", "flx", "fqm", "wlk", "common"]:

                    # get status
                    try:
                        component_status = int(
                            vtenant_record.get(
                                f"tenant_{component}_enabled", 1
                            )  # Default to 1 for common
                        )
                    except Exception:
                        component_status = 0

                    # only continue if the component is enabled
                    if component_status == 1:

                        # Handle collections
                        for object_name in collections_map_per_component[component]:

                            #
                            # Verify that the KV collection exists
                            #

                            kvstore_collection_name = (
                                f"kv_{object_name}_tenant_{tenant_id}"
                            )
                            kvstore_collection_exists = (
                                True  # assume the collection exists
                            )

                            # check if the collection exists
                            try:
                                collection = self.service.kvstore[
                                    kvstore_collection_name
                                ]
                                logging.info(
                                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, collection_name={kvstore_collection_name}, kvstore_collection_exists={kvstore_collection_exists}'
                                )
                                tenant_id_checked_status_dict[
                                    kvstore_collection_name
                                ] = {
                                    "result": "success",
                                    "type": "kvstore_collection",
                                }
                            except Exception as e:
                                kvstore_collection_exists = False
                                logging.error(
                                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, failed to retrieve the collection, collection_name="{kvstore_collection_name}", exception="{str(e)}"'
                                )
                                tenant_id_checked_status_dict[
                                    kvstore_collection_name
                                ] = {
                                    "result": "failure",
                                    "exception": str(e),
                                    "type": "kvstore_collection",
                                }

                            #
                            # Verify that the transform exists and contains the expected fields
                            #

                            transform_name = f"{object_name}_tenant_{tenant_id}"
                            transform_exists = True  # assume the transform exists
                            transforms_fields_list_csv = None
                            transforms_fields_list = None
                            transforms_expected_fields_list_csv = collections_dict[
                                object_name
                            ]
                            transforms_expected_fields_list = [
                                x.strip()
                                for x in transforms_expected_fields_list_csv.split(",")
                            ]
                            transforms_has_missing_fields = False  # assume False

                            # check if the transform exists
                            logging.info(
                                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, inspecting transform_name={transform_name}'
                            )
                            try:
                                transform = self.service.confs["transforms"][
                                    transform_name
                                ]
                                transforms_fields_list_csv = transform["fields_list"]
                                transforms_fields_list = (
                                    [
                                        x.strip()
                                        for x in transforms_fields_list_csv.split(",")
                                    ]
                                    if transforms_fields_list_csv
                                    else []
                                )

                                # Verify that the transform has at minimum the expected fields
                                for expected_field in transforms_expected_fields_list:
                                    if expected_field not in transforms_fields_list:
                                        transforms_has_missing_fields = True

                                logging.info(
                                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, transforms_name={transform_name}, transforms_fields_list={transforms_fields_list}, transforms_has_missing_fields={transforms_has_missing_fields}'
                                )
                            except Exception as e:
                                transform_exists = False
                                logging.error(
                                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, failed to retrieve the transform, transform_name="{transform_name}", exception="{str(e)}"'
                                )

                            # temp logging
                            logging.info(
                                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, kvstore_collection_exists={kvstore_collection_exists}, transform_exists={transform_exists}, transform_fields_list={transforms_fields_list}, transforms_has_missing_fields={transforms_has_missing_fields}'
                            )

                            #
                            # Take action if needed
                            #

                            #
                            # KVstore collection
                            #

                            # If the KVstore collection does not exist, create it
                            if not kvstore_collection_exists:
                                logging.warning(
                                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, collection_name={kvstore_collection_name}, kvstore_collection_exists={kvstore_collection_exists}, the KVstore collection was detected missing, it will be created.'
                                )

                                # create the KVstore collection
                                url = f'{reqinfo["server_rest_uri"]}/services/trackme/v2/configuration/admin/create_kvcollection'
                                data = {
                                    "tenant_id": tenant_id,
                                    "collection_name": kvstore_collection_name,
                                    "collection_acl": {
                                        "owner": tenant_owner,
                                        "sharing": trackme_default_sharing,
                                        "perms.write": tenant_roles_write_perms,
                                        "perms.read": tenant_roles_read_perms,
                                    },
                                    "owner": tenant_owner,
                                }

                                try:
                                    response = requests.post(
                                        url,
                                        headers={
                                            "Authorization": f"Splunk {self._metadata.searchinfo.session_key}"
                                        },
                                        data=json.dumps(data),
                                        verify=False,
                                        timeout=600,
                                    )
                                    response.raise_for_status()
                                    logging.info(
                                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, collection_name={kvstore_collection_name}, kvstore_collection_exists={kvstore_collection_exists}, the KVstore collection was detected missing, it has been created successfully.'
                                    )

                                    # add to auto_repair_actions_list
                                    auto_repair_actions_list.append(
                                        {
                                            "action": "create_kvcollection",
                                            "tenant_id": tenant_id,
                                            "component": component,
                                            "collection_name": kvstore_collection_name,
                                            "message": "KVstore collection was detected missing, it has been created successfully.",
                                        }
                                    )

                                except Exception as e:
                                    logging.error(
                                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, collection_name={kvstore_collection_name}, kvstore_collection_exists={kvstore_collection_exists}, the KVstore collection was detected missing, attempt to create it has failed, exception="{str(e)}"'
                                    )

                                    # add to auto_repair_actions_list
                                    auto_repair_actions_list.append(
                                        {
                                            "action": "create_kvcollection",
                                            "tenant_id": tenant_id,
                                            "component": component,
                                            "collection_name": kvstore_collection_name,
                                            "message": "KVstore collection was detected missing, attempt to create it has failed.",
                                            "exception": str(e),
                                        }
                                    )

                            #
                            # Transforms definition: if the transform does not exist, create it; if it exists but has missing fields, it will be deleted and recreated
                            #

                            if transform_exists and not transforms_has_missing_fields:
                                tenant_id_checked_status_dict[transform_name] = {
                                    "result": "success",
                                    "type": "transform",
                                }
                            elif not transform_exists:
                                tenant_id_checked_status_dict[transform_name] = {
                                    "result": "failure",
                                    "exception": "The transform was detected missing.",
                                    "type": "transform",
                                }
                            elif transform_exists and transforms_has_missing_fields:
                                tenant_id_checked_status_dict[transform_name] = {
                                    "result": "failure",
                                    "exception": "The transform was detected as existing but has missing fields.",
                                    "type": "transform",
                                }
                            else:
                                tenant_id_checked_status_dict[transform_name] = {
                                    "result": "unknown",
                                    "transform_exists": transform_exists,
                                    "transforms_has_missing_fields": transforms_has_missing_fields,
                                    "type": "transform",
                                }

                            if not transform_exists or transforms_has_missing_fields:

                                if not transform_exists:
                                    logging.warning(
                                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, transforms_name={transform_name}, transforms_exists={transform_exists}, the transform was detected missing, it will be created.'
                                    )

                                if transform_exists and transforms_has_missing_fields:
                                    logging.warning(
                                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, transforms_name={transform_name}, transforms_exists={transform_exists}, the transform was detected as existing but has missing fields, it will be recreated.'
                                    )

                                #
                                # delete the transform
                                #

                                url = f'{reqinfo["server_rest_uri"]}/services/trackme/v2/configuration/admin/delete_kvtransform'
                                data = {
                                    "tenant_id": tenant_id,
                                    "transform_name": transform_name,
                                }

                                try:
                                    response = requests.post(
                                        url,
                                        headers={
                                            "Authorization": f"Splunk {self._metadata.searchinfo.session_key}"
                                        },
                                        data=json.dumps(data),
                                        verify=False,
                                        timeout=600,
                                    )
                                    response.raise_for_status()
                                    logging.info(
                                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, transforms_name={transform_name}, transforms_exists={transform_exists}, the transform was detected as existing but has missing fields, it has been deleted successfully.'
                                    )

                                    # add to auto_repair_actions_list
                                    auto_repair_actions_list.append(
                                        {
                                            "action": "delete_kvtransform",
                                            "tenant_id": tenant_id,
                                            "component": component,
                                            "transform_name": transform_name,
                                            "message": "The transform was detected as existing but has missing fields, it has been deleted successfully.",
                                        }
                                    )

                                except Exception as e:
                                    logging.error(
                                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, transforms_name={transform_name}, transforms_exists={transform_exists}, the transform was detected as existing but has missing fields, attempt to delete it has failed, exception="{str(e)}"'
                                    )

                                    # add to auto_repair_actions_list
                                    auto_repair_actions_list.append(
                                        {
                                            "action": "delete_kvtransform",
                                            "tenant_id": tenant_id,
                                            "component": component,
                                            "transform_name": transform_name,
                                            "message": "The transform was detected as existing but has missing fields, attempt to delete it has failed.",
                                            "exception": str(e),
                                        }
                                    )

                                #
                                # create the transform
                                #

                                url = f'{reqinfo["server_rest_uri"]}/services/trackme/v2/configuration/admin/create_kvtransform'
                                data = {
                                    "tenant_id": tenant_id,
                                    "transform_name": transform_name,
                                    "transform_fields": transforms_expected_fields_list_csv,
                                    "collection_name": kvstore_collection_name,
                                    "transform_acl": {
                                        "owner": tenant_owner,
                                        "sharing": trackme_default_sharing,
                                        "perms.write": tenant_roles_write_perms,
                                        "perms.read": tenant_roles_read_perms,
                                    },
                                    "owner": tenant_owner,
                                }

                                try:
                                    response = requests.post(
                                        url,
                                        headers={
                                            "Authorization": f"Splunk {self._metadata.searchinfo.session_key}"
                                        },
                                        data=json.dumps(data),
                                        verify=False,
                                        timeout=600,
                                    )
                                    response.raise_for_status()
                                    logging.info(
                                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, transforms_name={transform_name}, transforms_exists={transform_exists}, the transform was detected missing, it has been created successfully.'
                                    )

                                    # add to auto_repair_actions_list
                                    auto_repair_actions_list.append(
                                        {
                                            "action": "create_kvtransform",
                                            "tenant_id": tenant_id,
                                            "component": component,
                                            "transform_name": transform_name,
                                            "message": "The transform was detected missing or inconsistent, it has been created successfully.",
                                        }
                                    )

                                except Exception as e:
                                    logging.error(
                                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component={component}, transforms_name={transform_name}, transforms_exists={transform_exists}, the transform was detected missing or inconsistent, attempt to create it has failed, exception="{str(e)}"'
                                    )

                                    # add to auto_repair_actions_list
                                    auto_repair_actions_list.append(
                                        {
                                            "action": "create_kvtransform",
                                            "tenant_id": tenant_id,
                                            "component": component,
                                            "transform_name": transform_name,
                                            "message": "The transform was detected missing or inconsistent, attempt to create it has failed.",
                                            "exception": str(e),
                                        }
                                    )

                # add to tenants_objects_status_dict
                tenants_objects_status_dict[tenant_id] = tenant_id_checked_status_dict

            # add to global_results_dict
            global_results_dict[task_name] = {
                "knowledge_objects_status": tenants_objects_status_dict,
                "auto_repair_actions_list": (
                    auto_repair_actions_list
                    if auto_repair_actions_list
                    else "No actions to perform."
                ),
            }

            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
            )

            ############################################################
            # TrackMe Virtual Tenants configuration issues fixer
            # Goals:
            # - Run a Splunk search to identify Virtual Tenants with configuration issues (missing reports)
            # - For each tenant found, identify enabled components from the central KVstore collection
            # - For each tenant/component combination, run the REST API call to fix issues
            # - Exclude replica tenants (tenant_replica = 1)
            ############################################################

            task_start = time.time()
            task_instance_id = self.get_uuid()
            task_name = "virtual_tenants:auto-repair:components_reports"

            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
            )

            # Step 1: Run a Splunk search to identify tenants with missing reports
            search = remove_leading_spaces(
                """
                search (index=_internal sourcetype=trackme:rest_api) OR (index=_internal sourcetype=trackme:custom_commands:*)
                log_level=error
                task="optimize_tenant_scheduled_reports"
                "failure to get report report_name"
                "urlencoded"
                | stats count by tenant_id
            """)

            kwargs_oneshot = {
                "earliest_time": "-24h",
                "latest_time": "now",
                "output_mode": "json",
                "count": 0,
            }

            # Counters
            tenants_with_issues_found = 0
            tenants_processed = 0
            tenants_fixed = 0
            tenants_skipped = 0
            total_components_fixed = 0
            total_components_failed = 0

            # Lists to store detailed information
            tenants_with_issues = []
            tenants_processed_details = []
            rest_call_responses = []

            try:
                reader = run_splunk_search(
                    self.service,
                    search,
                    kwargs_oneshot,
                    24,
                    5,
                )

                for item in reader:
                    if isinstance(item, dict):
                        tenant_id = item.get("tenant_id")
                        error_count = item.get("count", 0)
                        if tenant_id:
                            tenants_with_issues.append({
                                "tenant_id": tenant_id,
                                "error_count": error_count,
                            })
                            tenants_with_issues_found += 1

            except Exception as e:
                logging.error(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to run search for tenants with issues, exception="{str(e)}"'
                )

            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, found {tenants_with_issues_found} tenants with configuration issues'
            )

            # Step 2: Process each tenant found
            for tenant_info in tenants_with_issues:
                tenant_id = tenant_info["tenant_id"]
                error_count = tenant_info["error_count"]
                tenants_processed += 1

                # Initialize tenant processing details
                tenant_processing_detail = {
                    "tenant_id": tenant_id,
                    "error_count": error_count,
                    "enabled_components": [],
                    "components_fixed": 0,
                    "components_failed": 0,
                    "is_replica": False,
                    "skipped_reason": None,
                    "processing_status": "processing",
                }

                logging.info(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, processing tenant {tenant_id} with {error_count} errors'
                )

                # Get the tenant record from the central KVstore collection
                try:
                    # Find the tenant record in vtenant_records (already loaded)
                    tenant_record = None
                    for vtenant_record in vtenant_records:
                        if vtenant_record.get("tenant_id") == tenant_id:
                            tenant_record = vtenant_record
                            break

                    if not tenant_record:
                        logging.warning(
                            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, tenant record not found in central collection, skipping'
                        )
                        tenant_processing_detail["skipped_reason"] = "tenant record not found in central collection"
                        tenant_processing_detail["processing_status"] = "skipped"
                        tenants_processed_details.append(tenant_processing_detail)
                        tenants_skipped += 1
                        continue

                    # Check if the tenant is a replica (exclude if so)
                    try:
                        tenant_replica = int(tenant_record.get("tenant_replica", 0))
                    except Exception:
                        tenant_replica = 0

                    tenant_processing_detail["is_replica"] = (tenant_replica == 1)

                    if tenant_replica == 1:
                        logging.info(
                            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, is a replica tenant, skipping'
                        )
                        tenant_processing_detail["skipped_reason"] = "replica tenant"
                        tenant_processing_detail["processing_status"] = "skipped"
                        tenants_processed_details.append(tenant_processing_detail)
                        tenants_skipped += 1
                        continue

                    # Get the enabled components for this tenant
                    enabled_components = []
                    component_fields = {
                        "dsm": "tenant_dsm_enabled",
                        "dhm": "tenant_dhm_enabled",
                        "mhm": "tenant_mhm_enabled",
                        "flx": "tenant_flx_enabled",
                        "wlk": "tenant_wlk_enabled",
                        "fqm": "tenant_fqm_enabled",
                    }
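                    # Note: maps each component's short name to the enablement flag
                    # field carried by the tenant record in the central collection.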
|
|
|
|
for component, field_name in component_fields.items():
|
|
try:
|
|
if int(tenant_record.get(field_name, 0)) == 1:
|
|
enabled_components.append(component)
|
|
except (ValueError, TypeError):
|
|
continue
|
|
|
|
tenant_processing_detail["enabled_components"] = enabled_components
|
|
|
|
if not enabled_components:
|
|
logging.info(
|
|
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, has no enabled components, skipping'
|
|
)
|
|
tenant_processing_detail["skipped_reason"] = "no enabled components"
|
|
tenant_processing_detail["processing_status"] = "skipped"
|
|
tenants_processed_details.append(tenant_processing_detail)
|
|
tenants_skipped += 1
|
|
continue
|
|
|
|
logging.info(
|
|
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, has enabled components: {enabled_components}'
|
|
)
|
|
|
|
# Step 3: Fix each enabled component
|
|
tenant_components_fixed = 0
|
|
tenant_components_failed = 0
|
|
|
|
for component in enabled_components:
|
|
try:
|
|
# Prepare the REST API call
|
|
target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/vtenants/admin/check_component_tenant"
|
|
|
|
payload = {
|
|
"tenant_id": tenant_id,
|
|
"component_target": component,
|
|
"update_comment": f"Automated fix for missing reports - general health manager task"
|
|
}
|
|
|
|
logging.info(
|
|
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, fixing component {component}'
|
|
)
|
|
|
|
# Make the REST call
|
|
response = requests.post(
|
|
target_url,
|
|
headers=header,
|
|
data=json.dumps(payload),
|
|
verify=False,
|
|
timeout=600
|
|
)
|
|
|
|
# Store REST call response details
|
|
rest_call_response = {
|
|
"tenant_id": tenant_id,
|
|
"component": component,
|
|
"status_code": response.status_code,
|
|
"success": response.status_code == 200,
|
|
"response_text": response.text,
|
|
"timestamp": time.time()
|
|
}
|
|
|
|
# Try to parse JSON response if possible
|
|
try:
|
|
rest_call_response["response_json"] = response.json()
|
|
except:
|
|
rest_call_response["response_json"] = None
|
|
|
|
rest_call_responses.append(rest_call_response)
|
|
|
|
if response.status_code == 200:
|
|
tenant_components_fixed += 1
|
|
total_components_fixed += 1
|
|
logging.info(
|
|
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component {component}, successfully fixed'
|
|
)
|
|
else:
|
|
tenant_components_failed += 1
|
|
total_components_failed += 1
|
|
logging.error(
|
|
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component {component}, failed to fix: {response.status_code} - {response.text}'
|
|
)
|
|
|
|
except Exception as e:
|
|
tenant_components_failed += 1
|
|
total_components_failed += 1
|
|
|
|
# Store exception details
|
|
rest_call_response = {
|
|
"tenant_id": tenant_id,
|
|
"component": component,
|
|
"status_code": None,
|
|
"success": False,
|
|
"response_text": str(e),
|
|
"timestamp": time.time(),
|
|
"exception": True
|
|
}
|
|
rest_call_responses.append(rest_call_response)
|
|
|
|
logging.error(
|
|
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, component {component}, exception during fix: {str(e)}'
|
|
)
|
|
|
|
if tenant_components_fixed > 0:
|
|
tenants_fixed += 1
|
|
|
|
# Update tenant processing details
|
|
tenant_processing_detail["components_fixed"] = tenant_components_fixed
|
|
tenant_processing_detail["components_failed"] = tenant_components_failed
|
|
tenant_processing_detail["processing_status"] = "completed"
|
|
|
|
logging.info(
|
|
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, completed: {tenant_components_fixed} components fixed, {tenant_components_failed} components failed'
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, exception during processing: {str(e)}'
|
|
)
|
|
tenant_processing_detail["skipped_reason"] = f"exception during processing: {str(e)}"
|
|
tenant_processing_detail["processing_status"] = "error"
|
|
tenants_skipped += 1
|
|
|
|
# Always add the tenant processing detail to the list
|
|
tenants_processed_details.append(tenant_processing_detail)
|
|
|
|
# add to global_results_dict
|
|
global_results_dict[f"{task_name}"] = {
|
|
"tenants_with_issues_found": tenants_with_issues_found,
|
|
"tenants_processed": tenants_processed,
|
|
"tenants_fixed": tenants_fixed,
|
|
"tenants_skipped": tenants_skipped,
|
|
"total_components_fixed": total_components_fixed,
|
|
"total_components_failed": total_components_failed,
|
|
"tenants_with_issues": tenants_with_issues,
|
|
"tenants_processed_details": tenants_processed_details,
|
|
"rest_call_responses": rest_call_responses,
|
|
"result": f"{tenants_with_issues_found} tenants with issues found, {tenants_processed} processed, {tenants_fixed} fixed, {tenants_skipped} skipped, {total_components_fixed} components fixed, {total_components_failed} components failed",
|
|
}
|
|
|
|
logging.info(
|
|
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
|
|
)
|
|
|
|
############################################################
|
|
# TrackMe Virtual Tenants Check Health Tracker
|
|
# Goals:
|
|
# - If the tenant is enabled, then the health tracker should be enabled and scheduled.
|
|
# - If the tenant is disabled, then the health tracker should be disabled.
|
|
############################################################
|
|
|
|
task_start = time.time()
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "virtual_tenants:check_health_tracker"
|
|
|
|
logging.info(
|
|
f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
def manage_savedsearch_schedule(
|
|
tenant_id, savedsearch_names, feature_enabled, feature_name
|
|
):
|
|
"""
|
|
Helper function to manage saved search scheduling based on feature enablement.
|
|
|
|
Args:
|
|
savedsearch_names: List of saved search names to manage
|
|
feature_enabled: Boolean indicating if the feature should be enabled
|
|
feature_name: String name of the feature for logging purposes
|
|
"""
|
|
for savedsearch_name in savedsearch_names:
|
|
# get the status of the savedsearch
|
|
savedsearch_properties, savedsearch_acl = (
|
|
trackme_manage_report_schedule(
|
|
logging,
|
|
self._metadata.searchinfo.session_key,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
tenant_id,
|
|
savedsearch_name,
|
|
action="status",
|
|
)
|
|
)
|
|
|
|
# log
|
|
logging.info(
|
|
f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", savedsearch_properties="{json.dumps(savedsearch_properties, indent=2)}", savedsearch_acl="{json.dumps(savedsearch_acl, indent=2)}"'
|
|
)
|
|
|
|
# get the disabled status
|
|
disabled = int(savedsearch_properties.get("disabled", 0))
|
|
|
|
# get the is_scheduled status
|
|
is_scheduled = int(savedsearch_properties.get("is_scheduled", 0))
|
|
|
|
# Check tenant status first - if tenant is disabled, ensure health tracker is disabled
|
|
if feature_enabled == False:
|
|
# Tenant is disabled - ensure health tracker is disabled (but keep it scheduled)
|
|
if disabled == 0:
|
|
# Report is enabled - disable it
|
|
logging.info(
|
|
f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", tenant is disabled, disabling savedsearch.'
|
|
)
|
|
try:
|
|
trackme_report_update_enablement(
|
|
self._metadata.searchinfo.session_key,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
tenant_id,
|
|
savedsearch_name,
|
|
"disable",
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", savedsearch disabled successfully.'
|
|
)
|
|
return {
|
|
"action": "disable_savedsearch",
|
|
"tenant_id": tenant_id,
|
|
"savedsearch_name": savedsearch_name,
|
|
"message": "The savedsearch has been disabled successfully.",
|
|
}
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", an exception was encountered while trying to disable savedsearch, exception="{str(e)}"'
|
|
)
|
|
else:
|
|
# Report is already disabled - nothing to do
|
|
logging.info(
|
|
f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", tenant is disabled and savedsearch is already disabled, nothing to do.'
|
|
)
|
|
return {
|
|
"action": "nothing_to_do",
|
|
"tenant_id": tenant_id,
|
|
"savedsearch_name": savedsearch_name,
|
|
"message": "Tenant is disabled and savedsearch is already disabled, nothing to do.",
|
|
}
                # Tenant is enabled - ensure the health tracker is enabled AND scheduled
                elif feature_enabled == True:
                    # Track if we performed any actions
                    action_performed = False
                    action_message = ""

                    # Check if we need to enable the report
                    if disabled == 1:
                        logging.info(
                            f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", the report is currently disabled and needs to be enabled.'
                        )
                        try:
                            trackme_report_update_enablement(
                                self._metadata.searchinfo.session_key,
                                self._metadata.searchinfo.splunkd_uri,
                                tenant_id,
                                savedsearch_name,
                                "enable",
                            )
                            logging.info(
                                f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", savedsearch enabled successfully.'
                            )
                            action_performed = True
                            action_message = "The savedsearch has been enabled successfully"
                        except Exception as e:
                            logging.error(
                                f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", an exception was encountered while trying to enable the savedsearch, exception="{str(e)}"'
                            )
                            # stop here if we had an exception while enabling the savedsearch
                            continue

                    # Check if we need to schedule the report
                    if is_scheduled == 0:
                        logging.info(
                            f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", the report needs to be scheduled.'
                        )
                        try:
                            savedsearch_properties, savedsearch_acl = (
                                trackme_manage_report_schedule(
                                    logging,
                                    self._metadata.searchinfo.session_key,
                                    self._metadata.searchinfo.splunkd_uri,
                                    tenant_id,
                                    savedsearch_name,
                                    action="enable",
                                )
                            )
                            logging.info(
                                f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", savedsearch scheduled successfully, properties="{json.dumps(savedsearch_properties, indent=2)}"'
                            )
                            action_performed = True
                            if action_message:
                                action_message += " and scheduled successfully."
                            else:
                                action_message = "The savedsearch has been scheduled successfully."
                        except Exception as e:
                            logging.error(
                                f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", an exception was encountered while trying to schedule savedsearch, exception="{str(e)}"'
                            )

                    # Return the appropriate result based on the actions performed
                    if action_performed:
                        return {
                            "action": "enable_savedsearch",
                            "tenant_id": tenant_id,
                            "savedsearch_name": savedsearch_name,
                            "message": action_message,
                        }
                    else:
                        # Report is already enabled and scheduled - nothing to do
                        logging.info(
                            f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", disabled="{disabled}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", tenant is enabled and savedsearch is already enabled and scheduled, nothing to do.'
                        )
                        return {
                            "action": "nothing_to_do",
                            "tenant_id": tenant_id,
                            "savedsearch_name": savedsearch_name,
                            "message": "Tenant is enabled and savedsearch is already enabled and scheduled, nothing to do.",
                        }

                else:
                    # This should not happen as we have covered all cases above
                    logging.warning(
                        f'tenant_id="{tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", unexpected state, nothing to do.'
                    )
                    return {
                        "action": "nothing_to_do",
                        "tenant_id": tenant_id,
                        "savedsearch_name": savedsearch_name,
                        "message": "Unexpected state, nothing to do.",
                    }
        # init auto_repair_actions_list
        auto_repair_actions_list = []

        # A dict to store objects that were verified and their status, per tenant_id as the key
        tenants_objects_status_dict = {}

        for vtenant_record in vtenant_records:

            # get the tenant_id
            tenant_id = vtenant_record.get("tenant_id")

            # get the tenant_status (enabled/disabled)
            tenant_status = vtenant_record.get("tenant_status", "enabled")

            # check if the tenant is a replica tenant, if so, skip it
            if vtenant_record.get("tenant_replica", 0) == 1:
                logging.info(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, is a replica tenant, skipping.'
                )
                continue

            # init health_tracker_report_name
            health_tracker_report_name = f"trackme_health_tracker_tenant_{tenant_id}"
            health_tracker_check_result = {}

            try:
                # Determine if the health tracker should be enabled based on the tenant status
                health_tracker_enabled = tenant_status == "enabled"

                health_tracker_check_result = manage_savedsearch_schedule(
                    tenant_id,
                    [health_tracker_report_name],
                    health_tracker_enabled,
                    "health_tracker",
                )
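                # Note: manage_savedsearch_schedule returns a per-report summary dict,
                # e.g. {"action": "enable_savedsearch", "tenant_id": ...,
                # "savedsearch_name": ..., "message": ...}, which is stored per tenant
                # below and surfaced in the task results.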
            except Exception as e:
                logging.error(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"'
                )

            # add to tenants_objects_status_dict
            tenants_objects_status_dict[tenant_id] = health_tracker_check_result

        # add to global_results_dict
        global_results_dict[task_name] = {
            "health_tracker_check_result": tenants_objects_status_dict,
        }

        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
        )
        ############################################################
        # TrackMe Virtual Tenants stateful alerts records expiration
        # Goals:
        # - For each enabled Virtual Tenant, search for closed stateful alert records
        #   in the KVstore collection, and delete them once they are older than the
        #   configured expiration period (trackme_stateful_records_expiration_days).
        #   When purging stateful alert records, search for and purge the associated
        #   charts records, if any.
        ############################################################

        task_start = time.time()
        task_instance_id = self.get_uuid()
        task_name = "virtual_tenants:stateful_alerts_records_expiration"

        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
        )

        # get the stateful records expiration days
        stateful_records_expiration_days = int(
            reqinfo["trackme_conf"]["trackme_general"][
                "trackme_stateful_records_expiration_days"
            ]
        )

        # counters
        expired_stateful_records_deleted_count = 0
        expired_associated_charts_records_deleted_count = 0
        orphans_charts_records_deleted_count = 0
        for vtenant_record in vtenant_records:

            # get the tenant_id
            tenant_id = vtenant_record.get("tenant_id")

            # check if the tenant is a replica tenant, if so, skip it
            if vtenant_record.get("tenant_replica", 0) == 1:
                logging.info(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, is a replica tenant, skipping.'
                )
                continue

            # Define the query
            search = remove_leading_spaces(
                f"""
                | inputlookup trackme_stateful_alerting_tenant_{tenant_id} where alert_status="closed" | eval keyid=_key
                | eval record_age=now()-ctime, is_expired=if(record_age > 86400*{stateful_records_expiration_days}, 1, 0)
                | where is_expired=1
                | table keyid, incident_id
                """
            )
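            # Illustrative note (assumption): with output_mode=json, each result row is
            # expected to come back as a dict such as
            #   {"keyid": "64f0...", "incident_id": "incident_a"}
            # where the keyid (_key) drives the KVstore deletion and the incident_id
            # drives the lookup of associated charts records. The values shown are
            # hypothetical and for illustration only.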
            # A list to store expired incident_id values
            expired_incident_id_list = []

            # A list to store expired records
            expired_records_list = []

            # A list to store expired associated charts records
            expired_associated_charts_records_list = []

            # A list to store orphan charts records
            orphans_charts_records = []

            # Run the search
            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, running search="{search}"'
            )
            try:
                reader = run_splunk_search(
                    self.service,
                    search,
                    {
                        "earliest_time": "-5m",
                        "latest_time": "now",
                        "output_mode": "json",
                        "count": 0,
                    },
                    24,
                    5,
                )
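                # Assumption: the two trailing positional arguments of
                # run_splunk_search (24 and 5) act as retry/wait tuning parameters;
                # they are passed unchanged for every search dispatched by this module.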
                for item in reader:
                    if isinstance(item, dict):
                        expired_records_list.append(item.get("keyid"))
                        expired_incident_id_list.append(item.get("incident_id"))
                        logging.info(
                            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, records for incident_id={item.get("incident_id")} have been detected as expired and will be deleted from the KVstore collections, keyid={item.get("keyid")}'
                        )

            except Exception as e:
                logging.error(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to retrieve the list of expired records, exception="{str(e)}"'
                )

            # If there is nothing to do, continue
            if not expired_records_list:
                logging.info(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, no expired records to process, skipping.'
                )
                continue
            # Run a new search to retrieve the list of associated charts records
            # (the continue above guarantees expired_records_list is non-empty here)

            # Convert the list of expired incident_id values to a CSV filter
            expired_records_filtered_csv = f"({','.join(expired_incident_id_list)})"
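            # Illustrative note (assumption): incident_id values are expected to be
            # simple identifiers free of commas and quotes, so the rendered SPL filter
            # looks like: incident_id IN (incident_a,incident_b)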
            search = remove_leading_spaces(
                f"""
                | inputlookup trackme_stateful_alerting_charts_tenant_{tenant_id} where incident_id IN {expired_records_filtered_csv} | eval keyid=_key
                | table keyid, incident_id
                """
            )

            # Run the search
            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, running search="{search}"'
            )
            try:
                reader = run_splunk_search(
                    self.service,
                    search,
                    {
                        "earliest_time": "-5m",
                        "latest_time": "now",
                        "output_mode": "json",
                        "count": 0,
                    },
                    24,
                    5,
                )

                for item in reader:
                    if isinstance(item, dict):
                        expired_associated_charts_records_list.append(item.get("keyid"))

            except Exception as e:
                logging.error(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to retrieve the list of associated charts records, exception="{str(e)}"'
                )
            # Run a new search to retrieve the list of orphan charts records
            search = remove_leading_spaces(
                f"""
                | inputlookup trackme_stateful_alerting_charts_tenant_{tenant_id} | eval keyid=_key
                | lookup trackme_stateful_alerting_tenant_{tenant_id} incident_id AS incident_id OUTPUT incident_id as parent_incident_id
                | where (isnull(parent_incident_id) OR parent_incident_id="")
                | table keyid, incident_id
                """
            )
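            # Illustrative note: a charts record is considered an orphan when its
            # incident_id no longer resolves to a parent record in the main stateful
            # collection, e.g. a charts row carrying incident_id="incident_a" while no
            # record with that incident_id remains in
            # trackme_stateful_alerting_tenant_<tenant_id>. ("incident_a" is a
            # hypothetical value for illustration.)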
            # Run the search
            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, running search="{search}"'
            )
            try:
                reader = run_splunk_search(
                    self.service,
                    search,
                    {
                        "earliest_time": "-5m",
                        "latest_time": "now",
                        "output_mode": "json",
                        "count": 0,
                    },
                    24,
                    5,
                )

                for item in reader:
                    if isinstance(item, dict):
                        orphans_charts_records.append(item.get("keyid"))

            except Exception as e:
                logging.error(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to retrieve the list of orphan charts records, exception="{str(e)}"'
                )
            # Purge expired records from the stateful collection, if any
            if expired_records_list:

                # connect to the collection
                collection_stateful_alerting_name = (
                    f"kv_trackme_stateful_alerting_tenant_{tenant_id}"
                )
                collection_stateful_alerting = self.service.kvstore[
                    collection_stateful_alerting_name
                ]

                # for each expired record, delete the record from the stateful collection
                for expired_record in expired_records_list:

                    try:
                        # Remove the record
                        collection_stateful_alerting.data.delete(
                            json.dumps({"_key": expired_record})
                        )
                        expired_stateful_records_deleted_count += 1

                    except Exception as e:
                        logging.error(
                            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to delete the record from collection="{collection_stateful_alerting_name}", exception="{str(e)}"'
                        )
            # Purge expired associated charts records from the charts collection, if any
            if expired_associated_charts_records_list:

                # connect to the charts collection (the collection handle must be
                # re-fetched here, otherwise deletions would target the main stateful collection)
                collection_stateful_alerting_charts_name = (
                    f"kv_trackme_stateful_alerting_charts_tenant_{tenant_id}"
                )
                collection_stateful_alerting_charts = self.service.kvstore[
                    collection_stateful_alerting_charts_name
                ]

                # for each expired record, delete the record from the charts collection
                for expired_record in expired_associated_charts_records_list:

                    try:
                        # Remove the record
                        collection_stateful_alerting_charts.data.delete(
                            json.dumps({"_key": expired_record})
                        )
                        expired_associated_charts_records_deleted_count += 1
                    except Exception as e:
                        logging.error(
                            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to delete the record from collection="{collection_stateful_alerting_charts_name}", exception="{str(e)}"'
                        )
            # Purge orphan charts records from the charts collection, if any
            if orphans_charts_records:

                # connect to the charts collection
                collection_stateful_alerting_charts_name = (
                    f"kv_trackme_stateful_alerting_charts_tenant_{tenant_id}"
                )
                collection_stateful_alerting_charts = self.service.kvstore[
                    collection_stateful_alerting_charts_name
                ]

                # for each orphan record, delete the record from the charts collection
                for orphan_record in orphans_charts_records:

                    try:
                        # Remove the record
                        collection_stateful_alerting_charts.data.delete(
                            json.dumps({"_key": orphan_record})
                        )
                        orphans_charts_records_deleted_count += 1
                    except Exception as e:
                        logging.error(
                            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to delete the record from collection="{collection_stateful_alerting_charts_name}", exception="{str(e)}"'
                        )
        # add to global_results_dict
        global_results_dict[task_name] = {
            "expired_stateful_records_deleted_count": expired_stateful_records_deleted_count,
            "expired_associated_charts_records_deleted_count": expired_associated_charts_records_deleted_count,
            "orphans_charts_records_deleted_count": orphans_charts_records_deleted_count,
        }

        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
        )
        ############################################################
        # TrackMe Virtual Tenants stateful alerts duplicate opened incidents cleanup
        # Goals:
        # - For each enabled Virtual Tenant, verify that for a given object_id,
        #   there is no more than one opened incident (alert_status="opened") in the KVstore
        # - If there is more than one incident_id for the same object_id, keep only the latest
        #   (based on the field mtime, which is the epochtime of the last modification of the incident_id),
        #   and update the other records with alert_status="closed"
        ############################################################

        task_start = time.time()
        task_instance_id = self.get_uuid()
        task_name = "virtual_tenants:stateful_alerts_duplicate_opened_incidents_cleanup"

        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
        )

        # counters
        duplicate_opened_incidents_found_count = 0
        duplicate_opened_incidents_resolved_count = 0
        duplicate_opened_incidents_resolution_failures_count = 0

        for vtenant_record in vtenant_records:

            # get the tenant_id
            tenant_id = vtenant_record.get("tenant_id")

            # check if the tenant is a replica tenant, if so, skip it
            if vtenant_record.get("tenant_replica", 0) == 1:
                logging.info(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, is a replica tenant, skipping.'
                )
                continue

            # Define the query to find duplicate opened incidents for the same object_id
            # Within mvmap, the latest keyid/incident_id is excluded so that only the
            # non-latest records end up in the to_close_* fields
            search = remove_leading_spaces(
                f"""
                | inputlookup trackme_stateful_alerting_tenant_{tenant_id} where alert_status="opened" | eval keyid=_key
                | eval _time=mtime
                | stats count as incident_count, values(incident_id) as incident_ids, latest(incident_id) as latest_incident_id, latest(keyid) as latest_keyid, values(keyid) as keyids, max(mtime) as max_mtime by object_id
                | where incident_count > 1
                | eval to_close_keyids=mvmap(keyids, if(mvfind(keyids, "^".latest_keyid."$")==0, null(), keyids))
                | eval to_close_incident_ids=mvmap(incident_ids, if(mvfind(incident_ids, "^".latest_incident_id."$")==0, null(), incident_ids))
                | table object_id, incident_count, latest_incident_id, latest_keyid, max_mtime, to_close_keyids, to_close_incident_ids
                """
            )
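            # Illustrative note: for an object_id carrying the opened incidents
            # ("incident_a", "incident_b") where "incident_b" is the most recently
            # modified (highest mtime), the search is expected to return
            # to_close_incident_ids=("incident_a") so that only the latest incident
            # stays opened. (Values are hypothetical and for illustration only.)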
            # A list to store duplicate opened incidents data
            duplicate_opened_incidents_list = []

            # Run the search
            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, running search="{search}"'
            )
            try:
                reader = run_splunk_search(
                    self.service,
                    search,
                    {
                        "earliest_time": "-5m",
                        "latest_time": "now",
                        "output_mode": "json",
                        "count": 0,
                    },
                    24,
                    5,
                )

                for item in reader:
                    if isinstance(item, dict):
                        duplicate_opened_incidents_list.append(item)
                        duplicate_opened_incidents_found_count += 1
                        logging.info(
                            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, found duplicate opened incidents for object_id={item.get("object_id")}, incident_count={item.get("incident_count")}, latest_incident_id={item.get("latest_incident_id")}, latest_keyid={item.get("latest_keyid")}, max_mtime={item.get("max_mtime")}'
                        )

            except Exception as e:
                logging.error(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to retrieve the list of duplicate opened incidents, exception="{str(e)}"'
                )

            # If there is nothing to do, continue
            if not duplicate_opened_incidents_list:
                logging.info(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, no duplicate opened incidents to process, skipping.'
                )
                continue
            # Process duplicate opened incidents
            for duplicate_incident in duplicate_opened_incidents_list:

                object_id = duplicate_incident.get("object_id")
                to_close_keyids = duplicate_incident.get("to_close_keyids", [])
                # if not a list, split the CSV string into a list
                if not isinstance(to_close_keyids, list):
                    to_close_keyids = to_close_keyids.split(",")
                to_close_incident_ids = duplicate_incident.get(
                    "to_close_incident_ids", []
                )
                # if not a list, split the CSV string into a list
                if not isinstance(to_close_incident_ids, list):
                    to_close_incident_ids = to_close_incident_ids.split(",")
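                # Descriptive note: depending on how the JSON results serialize
                # multivalue fields, to_close_keyids/to_close_incident_ids may arrive
                # either as a native list or as a single comma-separated string; both
                # shapes are normalized to lists above before being stripped below.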
                # Parse the to_close_keyids and to_close_incident_ids
                if to_close_keyids:
                    to_close_keyids_list = [
                        keyid.strip() for keyid in to_close_keyids if keyid.strip()
                    ]
                else:
                    to_close_keyids_list = []

                if to_close_incident_ids:
                    to_close_incident_ids_list = [
                        incident_id.strip()
                        for incident_id in to_close_incident_ids
                        if incident_id.strip()
                    ]
                else:
                    to_close_incident_ids_list = []
                logging.info(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, object_id={object_id}, will close {len(to_close_keyids_list)} duplicate incidents: keyids={to_close_keyids_list}, incident_ids={to_close_incident_ids_list}'
                )

                # Connect to the collection
                collection_stateful_alerting_name = (
                    f"kv_trackme_stateful_alerting_tenant_{tenant_id}"
                )
                collection_stateful_alerting = self.service.kvstore[
                    collection_stateful_alerting_name
                ]

                # Update each duplicate incident to the closed status
                for keyid in to_close_keyids_list:
                    try:

                        # Get the current record
                        record_list = collection_stateful_alerting.data.query(
                            query=json.dumps({"_key": keyid})
                        )

                        if record_list and len(record_list) > 0:
                            # Extract the first (and normally only) record from the list
                            record = record_list[0]
                            # Update the record to the closed status
                            record["alert_status"] = "closed"
                            record["mtime"] = int(time.time())  # Update the modification time

                            # Update the record in the collection
                            collection_stateful_alerting.data.update(
                                keyid, json.dumps(record)
                            )
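                            # Descriptive note: splunklib's KVstore data.update(id, document)
                            # replaces the stored document for that _key with the submitted
                            # JSON body, which is why the full record is queried first,
                            # mutated in place, and written back as a whole.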
                            duplicate_opened_incidents_resolved_count += 1

                            logging.info(
                                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, object_id={object_id}, successfully closed duplicate incident with keyid={keyid}'
                            )
                        else:
                            logging.warning(
                                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, object_id={object_id}, record with keyid={keyid} not found in collection'
                            )

                    except Exception as e:
                        duplicate_opened_incidents_resolution_failures_count += 1
                        logging.error(
                            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, object_id={object_id}, failed to close duplicate incident with keyid={keyid}, exception="{str(e)}"'
                        )
        # add to global_results_dict
        global_results_dict[task_name] = {
            "duplicate_opened_incidents_found_count": duplicate_opened_incidents_found_count,
            "duplicate_opened_incidents_resolved_count": duplicate_opened_incidents_resolved_count,
            "duplicate_opened_incidents_resolution_failures_count": duplicate_opened_incidents_resolution_failures_count,
            "result": f"{duplicate_opened_incidents_found_count} duplicate opened incidents found, {duplicate_opened_incidents_resolved_count} resolved successfully, {duplicate_opened_incidents_resolution_failures_count} resolution failures",
        }

        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
        )
        ############################################################
        # TrackMe Virtual Tenants stateful charts records expiration
        # Goals:
        # - For each Virtual Tenant, purge records in the stateful charts KVstore collection:
        #   trackme_stateful_alerting_charts_tenant_<tenant_id>
        # - A record is purged once its age reaches the configured expiration period
        #   (trackme_stateful_charts_records_expiration_days), based on the field "ctime"
        #   of the record, which contains the epochtime of its creation
        ############################################################

        task_start = time.time()
        task_instance_id = self.get_uuid()
        task_name = "virtual_tenants:stateful_charts_records_expiration"

        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
        )

        # get the stateful charts records expiration days
        stateful_records_expiration_days = int(
            reqinfo["trackme_conf"]["trackme_general"][
                "trackme_stateful_charts_records_expiration_days"
            ]
        )

        # Define the expiration threshold (based on the expiration days)
        charts_records_expiration_seconds = stateful_records_expiration_days * 24 * 3600
        current_time = time.time()
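        # Illustrative example: with trackme_stateful_charts_records_expiration_days=2,
        # charts_records_expiration_seconds = 2 * 24 * 3600 = 172800, so any record
        # whose ctime is 172800 seconds or more in the past is purged below.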
        # counters
        expired_charts_records_deleted_count = 0
        expired_charts_records_deletion_failures_count = 0

        for vtenant_record in vtenant_records:

            # get the tenant_id
            tenant_id = vtenant_record.get("tenant_id")

            # check if the tenant is a replica tenant, if so, skip it
            if vtenant_record.get("tenant_replica", 0) == 1:
                logging.info(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, is a replica tenant, skipping.'
                )
                continue

            # connect to the stateful charts collection
            collection_stateful_charts_name = (
                f"kv_trackme_stateful_alerting_charts_tenant_{tenant_id}"
            )

            try:
                collection_stateful_charts = self.service.kvstore[
                    collection_stateful_charts_name
                ]

                # get all records from the collection
                (
                    charts_records,
                    charts_collection_keys,
                    charts_collection_dict,
                ) = get_full_kv_collection(
                    collection_stateful_charts, collection_stateful_charts_name
                )
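                # Descriptive note: get_full_kv_collection appears to return the
                # collection content in three shapes - a list of records, the list of
                # their keys, and a dict keyed by _key; only the records list is
                # consumed by this task.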
                # A list to store expired records to delete
                expired_charts_records_list = []

                # Process each record to check whether it is older than the expiration threshold
                for record in charts_records:
                    try:
                        # Get the ctime field
                        ctime_str = record.get("ctime")
                        if ctime_str is None:
                            logging.warning(
                                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, record with key={record.get("_key")} has no ctime field, skipping.'
                            )
                            continue

                        # Convert to float (ctime may be stored as a string)
                        ctime_float = float(ctime_str)

                        # Calculate the age in seconds
                        record_age_seconds = current_time - ctime_float

                        # Check if the record is older than the expiration threshold
                        if record_age_seconds >= charts_records_expiration_seconds:
                            expired_charts_records_list.append(record.get("_key"))
                            logging.info(
                                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, record with key={record.get("_key")} is {round(record_age_seconds/3600, 2)} hours old (>= {round(charts_records_expiration_seconds/3600, 2)} hours), will be deleted.'
                            )

                    except (ValueError, TypeError) as e:
                        logging.error(
                            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to process the ctime field for record with key={record.get("_key")}, ctime="{ctime_str}", exception="{str(e)}"'
                        )
                        continue
                # If no expired records, continue to the next tenant
                if not expired_charts_records_list:
                    logging.info(
                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, no expired charts records to process, skipping.'
                    )
                    continue

                # Delete expired records from the collection
                for expired_record_key in expired_charts_records_list:
                    try:
                        # Remove the record
                        collection_stateful_charts.data.delete(
                            json.dumps({"_key": expired_record_key})
                        )
                        expired_charts_records_deleted_count += 1
                        logging.info(
                            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, successfully deleted expired charts record with key={expired_record_key}'
                        )

                    except Exception as e:
                        expired_charts_records_deletion_failures_count += 1
                        logging.error(
                            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to delete expired charts record with key={expired_record_key}, exception="{str(e)}"'
                        )

            except Exception as e:
                logging.error(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id={tenant_id}, failed to access collection="{collection_stateful_charts_name}", exception="{str(e)}"'
                )
        # add to global_results_dict
        global_results_dict[task_name] = {
            "expired_charts_records_deleted_count": expired_charts_records_deleted_count,
            "expired_charts_records_deletion_failures_count": expired_charts_records_deletion_failures_count,
            "result": f"{expired_charts_records_deleted_count} expired charts records deleted, {expired_charts_records_deletion_failures_count} deletion failures",
        }

        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
        )
        ############################################################
        # Recurring Bank Holidays Management
        # Goals:
        # - Process recurring bank holidays and create future occurrences
        # - Handle holidays that span across years (e.g., Dec 31 - Jan 1)
        # - Clean up past bank holiday periods that have already ended
        ############################################################

        task_start = time.time()
        task_instance_id = self.get_uuid()
        task_name = "bank-holidays:recurring_periods_management"

        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
        )

        # counters
        recurring_holidays_processed_count = 0
        new_periods_created_count = 0
        periods_creation_failures_count = 0
        past_periods_deleted_count = 0
        past_periods_deletion_failures_count = 0

        try:
            # Connect to the bank holidays collection
            collection_name = "kv_trackme_bank_holidays"
            collection = self.service.kvstore[collection_name]

            # Get the current time and year
            current_time = time.time()
            current_year = datetime.datetime.fromtimestamp(
                current_time, tz=datetime.timezone.utc
            ).year
            ############################################################
            # Step 1: Clean up past bank holiday periods
            # Strategy:
            # - For recurring holidays: keep the oldest one for each pattern (the template), delete other past duplicates
            # - For non-recurring holidays: delete all that are past
            ############################################################

            logging.info(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting cleanup of past bank holiday periods.'
            )

            try:
                # Get all bank holidays
                all_holidays = collection.data.query()

                # Group recurring holidays by pattern to identify the templates
                recurring_by_pattern = {}
                non_recurring_past = []

                for holiday in all_holidays:
                    holiday_dict = dict(holiday)
                    holiday_key = holiday_dict.get("_key")
                    end_date_epoch = holiday_dict.get("end_date")
                    is_recurring = holiday_dict.get("is_recurring", False)

                    if not end_date_epoch:
                        logging.warning(
                            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, holiday with key={holiday_key} has no end_date, skipping cleanup check.'
                        )
                        continue
                    # Check if the period has already passed
                    if int(end_date_epoch) < int(current_time):
                        if is_recurring:
                            # For recurring holidays, group by pattern to keep the templates
                            period_name = holiday_dict.get("period_name", "")
                            country_code = holiday_dict.get("country_code", "")
                            start_date_epoch = holiday_dict.get("start_date")

                            if start_date_epoch:
                                try:
                                    start_dt = datetime.datetime.fromtimestamp(
                                        start_date_epoch, tz=datetime.timezone.utc
                                    )
                                    end_dt = datetime.datetime.fromtimestamp(
                                        end_date_epoch, tz=datetime.timezone.utc
                                    )

                                    # Create the pattern key
                                    pattern_key = f"{period_name}|{country_code}|{start_dt.month:02d}-{start_dt.day:02d}|{end_dt.month:02d}-{end_dt.day:02d}"
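                                    # Illustrative example (hypothetical values): a recurring
                                    # "Christmas break" for country_code="UK" running Dec 24
                                    # to Dec 26 yields the pattern key
                                    # "Christmas break|UK|12-24|12-26", so yearly occurrences
                                    # of the same period group together regardless of the year.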
                                    if pattern_key not in recurring_by_pattern:
                                        recurring_by_pattern[pattern_key] = []
                                    recurring_by_pattern[pattern_key].append(
                                        {
                                            "key": holiday_key,
                                            "time_created": holiday_dict.get("time_created", 0),
                                            "holiday": holiday_dict,
                                        }
                                    )
                                except Exception as e:
                                    logging.warning(
                                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to parse dates for holiday key={holiday_key}, exception="{str(e)}", treating as non-recurring for cleanup.'
                                    )
                                    non_recurring_past.append(holiday_dict)
                            else:
                                # Missing start_date, treat as non-recurring
                                non_recurring_past.append(holiday_dict)
                        else:
                            # Non-recurring past holiday - mark it for deletion
                            non_recurring_past.append(holiday_dict)
                # Delete non-recurring past holidays via the REST API
                for holiday_dict in non_recurring_past:
                    holiday_key = holiday_dict.get("_key")
                    period_name = holiday_dict.get("period_name", "unknown")
                    try:
                        end_date_epoch = holiday_dict.get("end_date")
                        end_dt = datetime.datetime.fromtimestamp(
                            end_date_epoch, tz=datetime.timezone.utc
                        )
                        end_date_str = end_dt.strftime("%Y-%m-%d %H:%M:%S")

                        # Use the REST API endpoint for the deletion (enables auditing)
                        target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/bank_holidays/admin/delete"
                        payload = {"_key": holiday_key}

                        response = requests.post(
                            target_url,
                            headers=header,
                            data=json.dumps(payload),
                            verify=False,
                            timeout=600,
                        )

                        if response.status_code == 200:
                            past_periods_deleted_count += 1
                            logging.info(
                                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, deleted past non-recurring bank holiday via REST API: key={holiday_key}, period_name="{period_name}", end_date={end_date_str}'
                            )
                        else:
                            past_periods_deletion_failures_count += 1
                            logging.error(
                                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to delete past non-recurring bank holiday via REST API: key={holiday_key}, period_name="{period_name}", status_code={response.status_code}, response={response.text}'
                            )
                    except Exception as e:
                        past_periods_deletion_failures_count += 1
                        logging.error(
                            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to delete past non-recurring bank holiday: key={holiday_key}, period_name="{period_name}", exception="{str(e)}"'
                        )
                # For recurring holidays, keep the oldest one (the template) for each pattern, delete the others
                for pattern_key, holidays_list in recurring_by_pattern.items():
                    if len(holidays_list) > 1:
                        # Sort by time_created (oldest first) - keep the first one as the template
                        holidays_list.sort(key=lambda x: x.get("time_created", 0))
                        template = holidays_list[0]
                        duplicates = holidays_list[1:]

                        logging.debug(
                            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, pattern="{pattern_key}" has {len(holidays_list)} past occurrences, keeping template (key={template["key"]}), deleting {len(duplicates)} duplicates.'
                        )

                        # Delete duplicate past periods via the REST API
                        for duplicate in duplicates:
                            duplicate_key = duplicate["key"]
                            duplicate_holiday = duplicate["holiday"]
                            period_name = duplicate_holiday.get("period_name", "unknown")
                            try:
                                end_date_epoch = duplicate_holiday.get("end_date")
                                end_dt = datetime.datetime.fromtimestamp(
                                    end_date_epoch, tz=datetime.timezone.utc
                                )
                                end_date_str = end_dt.strftime("%Y-%m-%d %H:%M:%S")

                                # Use the REST API endpoint for the deletion (enables auditing)
                                target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/bank_holidays/admin/delete"
                                payload = {"_key": duplicate_key}

                                response = requests.post(
                                    target_url,
                                    headers=header,
                                    data=json.dumps(payload),
                                    verify=False,
                                    timeout=600,
                                )

                                if response.status_code == 200:
                                    past_periods_deleted_count += 1
                                    logging.info(
                                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, deleted duplicate past recurring bank holiday via REST API: key={duplicate_key}, period_name="{period_name}", pattern="{pattern_key}", end_date={end_date_str}'
                                    )
                                else:
                                    past_periods_deletion_failures_count += 1
                                    logging.error(
                                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to delete duplicate past recurring bank holiday via REST API: key={duplicate_key}, period_name="{period_name}", status_code={response.status_code}, response={response.text}'
                                    )
                            except Exception as e:
                                past_periods_deletion_failures_count += 1
                                logging.error(
                                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to delete duplicate past recurring bank holiday: key={duplicate_key}, period_name="{period_name}", exception="{str(e)}"'
                                )
                    # If there is only one past occurrence, keep it as the template (no deletion needed)

                logging.info(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, cleanup completed: {past_periods_deleted_count} past periods deleted, {past_periods_deletion_failures_count} deletion failures.'
                )
            except Exception as e:
                logging.error(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed during cleanup of past bank holidays, exception="{str(e)}"'
                )
            ############################################################
            # Step 2: Process recurring holidays and create future occurrences
            # Ensure we have periods for the current year + the next year (year+1)
            ############################################################

            # Get all recurring bank holidays (after the cleanup)
            query_recurring = json.dumps({"is_recurring": True})
            recurring_holidays = collection.data.query(query=query_recurring)

            if not recurring_holidays:
                logging.info(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, no recurring bank holidays found, skipping creation task.'
                )
            else:
                logging.info(
                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, found {len(recurring_holidays)} recurring bank holiday(s) to process.'
                )

                # Plan ahead for the current year + the next year (year+1)
                # This ensures we always have periods for the current year (if not yet passed) and the next year
                years_to_check = [current_year, current_year + 1]
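                # Illustrative example: when the task runs during 2025, years_to_check
                # is [2025, 2026], so each recurring pattern is materialized for both
                # years unless a matching period already exists.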
                # Get all existing bank holidays to check for duplicates
                all_existing_holidays = collection.data.query()
                existing_periods_by_pattern = {}

                # Group existing holidays by pattern (period_name + country_code + month/day)
                for holiday in all_existing_holidays:
                    holiday_dict = dict(holiday)
                    period_name = holiday_dict.get("period_name", "")
                    country_code = holiday_dict.get("country_code", "")
                    start_date_epoch = holiday_dict.get("start_date")
                    end_date_epoch = holiday_dict.get("end_date")

                    if start_date_epoch and end_date_epoch:
                        start_dt = datetime.datetime.fromtimestamp(
                            start_date_epoch, tz=datetime.timezone.utc
                        )
                        end_dt = datetime.datetime.fromtimestamp(
                            end_date_epoch, tz=datetime.timezone.utc
                        )

                        # Create a pattern key: period_name + country_code + month/day of start and end
                        pattern_key = f"{period_name}|{country_code}|{start_dt.month:02d}-{start_dt.day:02d}|{end_dt.month:02d}-{end_dt.day:02d}"

                        if pattern_key not in existing_periods_by_pattern:
                            existing_periods_by_pattern[pattern_key] = []
                        existing_periods_by_pattern[pattern_key].append(
                            {
                                "year": start_dt.year,
                                "record": holiday_dict,
                            }
                        )
                # Process each recurring holiday
                for recurring_holiday in recurring_holidays:
                    recurring_holidays_processed_count += 1
                    holiday_dict = dict(recurring_holiday)

                    period_name = holiday_dict.get("period_name", "")
                    country_code = holiday_dict.get("country_code", "")
                    comment = holiday_dict.get("comment", "")
                    start_date_epoch = holiday_dict.get("start_date")
                    end_date_epoch = holiday_dict.get("end_date")
                    src_user = holiday_dict.get("src_user", "system")

                    if not start_date_epoch or not end_date_epoch:
                        logging.warning(
                            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, recurring holiday with key={holiday_dict.get("_key")} has invalid dates, skipping.'
                        )
                        continue

                    # Parse the original dates
                    try:
                        start_dt = datetime.datetime.fromtimestamp(
                            start_date_epoch, tz=datetime.timezone.utc
                        )
                        end_dt = datetime.datetime.fromtimestamp(
                            end_date_epoch, tz=datetime.timezone.utc
                        )
                    except Exception as e:
                        logging.error(
                            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to parse dates for recurring holiday key={holiday_dict.get("_key")}, exception="{str(e)}"'
                        )
                        continue

                    # Extract month/day from the original dates
                    start_month = start_dt.month
                    start_day = start_dt.day
                    start_hour = start_dt.hour
                    start_minute = start_dt.minute

                    end_month = end_dt.month
                    end_day = end_dt.day
                    end_hour = end_dt.hour
                    end_minute = end_dt.minute

                    # Create the pattern key for this recurring holiday
                    pattern_key = f"{period_name}|{country_code}|{start_month:02d}-{start_day:02d}|{end_month:02d}-{end_day:02d}"

                    # Check which years already have this holiday
                    # We check both by year and by actual date range to be more robust
                    existing_years = set()
                    existing_date_ranges = {}  # year -> list of (start_epoch, end_epoch) tuples

                    if pattern_key in existing_periods_by_pattern:
                        for existing_period in existing_periods_by_pattern[pattern_key]:
                            year = existing_period["year"]
                            existing_years.add(year)
                            record = existing_period["record"]
                            if year not in existing_date_ranges:
                                existing_date_ranges[year] = []
                            existing_date_ranges[year].append(
                                (record.get("start_date"), record.get("end_date"))
                            )

                    logging.info(
                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, processing recurring holiday: period_name="{period_name}", pattern_key="{pattern_key}", existing_years={sorted(existing_years)}'
                    )
                    # Create periods for the missing years
                    for target_year in years_to_check:
                        # Check if the year already exists
                        if target_year in existing_years:
                            # Double-check by verifying that the date range matches
                            # This handles edge cases where the same year might have been created manually
                            should_skip = False
                            if target_year in existing_date_ranges:
                                # Calculate what the date range should be for this year
                                if end_month < start_month or (
                                    end_month == start_month and end_day < start_day
                                ):
                                    # Year-spanning: target_year to target_year+1
                                    expected_start = self.safe_create_datetime(
                                        target_year,
                                        start_month,
                                        start_day,
                                        start_hour,
                                        start_minute,
                                        tzinfo=datetime.timezone.utc,
                                    ).timestamp()
                                    expected_end = self.safe_create_datetime(
                                        target_year + 1,
                                        end_month,
                                        end_day,
                                        end_hour,
                                        end_minute,
                                        tzinfo=datetime.timezone.utc,
                                    ).timestamp()
                                else:
                                    # Normal: both dates in target_year
                                    expected_start = self.safe_create_datetime(
                                        target_year,
                                        start_month,
                                        start_day,
                                        start_hour,
                                        start_minute,
                                        tzinfo=datetime.timezone.utc,
                                    ).timestamp()
                                    expected_end = self.safe_create_datetime(
                                        target_year,
                                        end_month,
                                        end_day,
                                        end_hour,
                                        end_minute,
                                        tzinfo=datetime.timezone.utc,
                                    ).timestamp()

                                # Check if any existing period matches this date range (same-day tolerance)
                                for existing_start, existing_end in existing_date_ranges[
                                    target_year
                                ]:
                                    if existing_start and existing_end:
                                        # Check if the dates are on the same day (tolerance for time differences)
                                        existing_start_dt = datetime.datetime.fromtimestamp(
                                            existing_start, tz=datetime.timezone.utc
                                        )
                                        expected_start_dt = datetime.datetime.fromtimestamp(
                                            expected_start, tz=datetime.timezone.utc
                                        )
                                        if (
                                            existing_start_dt.year == expected_start_dt.year
                                            and existing_start_dt.month == expected_start_dt.month
                                            and existing_start_dt.day == expected_start_dt.day
                                        ):
                                            should_skip = True
                                            break

                            if should_skip:
                                logging.debug(
                                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, period already exists for year={target_year}, period_name="{period_name}", skipping.'
                                )
                                continue
                        # Calculate the start and end dates for the target year
                        try:
                            # Handle year-spanning holidays (e.g., Dec 31 - Jan 1)
                            if end_month < start_month or (
                                end_month == start_month and end_day < start_day
                            ):
                                # Holiday spans across years (e.g., Dec 31 - Jan 1)
                                # Start date is in target_year, end date is in target_year + 1
                                new_start_dt = self.safe_create_datetime(
                                    target_year,
                                    start_month,
                                    start_day,
                                    start_hour,
                                    start_minute,
                                    tzinfo=datetime.timezone.utc,
                                )
                                new_end_dt = self.safe_create_datetime(
                                    target_year + 1,
                                    end_month,
                                    end_day,
                                    end_hour,
                                    end_minute,
                                    tzinfo=datetime.timezone.utc,
                                )
                            else:
                                # Normal holiday within the same year
                                new_start_dt = self.safe_create_datetime(
                                    target_year,
                                    start_month,
                                    start_day,
                                    start_hour,
                                    start_minute,
                                    tzinfo=datetime.timezone.utc,
                                )
                                new_end_dt = self.safe_create_datetime(
                                    target_year,
                                    end_month,
                                    end_day,
                                    end_hour,
                                    end_minute,
                                    tzinfo=datetime.timezone.utc,
                                )
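                            # Illustrative example: a recurring period defined as Dec 31
                            # -> Jan 1 has end_month < start_month, so for
                            # target_year=2025 the new occurrence runs from 2025-12-31
                            # to 2026-01-01, while a period fully inside one year
                            # (e.g., Jul 14) stays entirely in 2025.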
                            # Convert to epoch timestamps
                            new_start_epoch = int(round(new_start_dt.timestamp()))
                            new_end_epoch = int(round(new_end_dt.timestamp()))

                            # Validate the date range
                            if new_end_epoch <= new_start_epoch:
                                logging.warning(
                                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, invalid date range for year={target_year}, period_name="{period_name}", skipping.'
                                )
                                continue
                            # Create the new record via the REST API (enables auditing and delegates complexity)
                            try:
                                target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/bank_holidays/admin/create"
                                payload = {
                                    "period_name": period_name,
                                    "start_date": new_start_epoch,
                                    "end_date": new_end_epoch,
                                    "comment": comment,
                                    "country_code": country_code,
                                    "is_recurring": True,  # Keep the recurring flag
                                }

                                response = requests.post(
                                    target_url,
                                    headers=header,
                                    data=json.dumps(payload),
                                    verify=False,
                                    timeout=600,
                                )

                                if response.status_code == 200:
                                    response_data = response.json()
                                    created_record = response_data.get("payload", {})
                                    new_key = created_record.get("_key")
                                    new_periods_created_count += 1
                                    logging.info(
                                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, successfully created recurring bank holiday period via REST API: key={new_key}, period_name="{period_name}", year={target_year}, start_date={new_start_dt.strftime("%Y-%m-%d %H:%M")}, end_date={new_end_dt.strftime("%Y-%m-%d %H:%M")}'
                                    )
                                elif response.status_code == 409:
                                    # 409 Conflict means the period already exists (duplicate detection)
                                    # This is expected behavior, not an error - log at debug level
                                    logging.debug(
                                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, recurring bank holiday period already exists (duplicate detected): period_name="{period_name}", year={target_year}, status_code=409'
                                    )
                                else:
                                    periods_creation_failures_count += 1
                                    logging.error(
                                        f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to create recurring bank holiday period via REST API for year={target_year}, period_name="{period_name}", status_code={response.status_code}, response={response.text}'
                                    )
                            except Exception as e:
                                periods_creation_failures_count += 1
                                logging.error(
                                    f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to create recurring bank holiday period for year={target_year}, period_name="{period_name}", exception="{str(e)}"'
                                )

                        except Exception as e:
                            periods_creation_failures_count += 1
                            logging.error(
                                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to calculate dates for year={target_year}, period_name="{period_name}", exception="{str(e)}"'
                            )

        except Exception as e:
            logging.error(
                f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to process recurring bank holidays, exception="{str(e)}"'
            )
        # add to global_results_dict
        global_results_dict[task_name] = {
            "recurring_holidays_processed_count": recurring_holidays_processed_count,
            "new_periods_created_count": new_periods_created_count,
            "periods_creation_failures_count": periods_creation_failures_count,
            "past_periods_deleted_count": past_periods_deleted_count,
            "past_periods_deletion_failures_count": past_periods_deletion_failures_count,
            "result": f"{recurring_holidays_processed_count} recurring holidays processed, {new_periods_created_count} new periods created, {periods_creation_failures_count} creation failures, {past_periods_deleted_count} past periods deleted, {past_periods_deletion_failures_count} deletion failures",
        }

        logging.info(
            f'instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
        )
        #
        # End
        #

        # yield the results
        yield_record = {
            "_time": time.time(),
            "_raw": global_results_dict,
            "results": global_results_dict,
        }

        yield yield_record

        #
        # END
        #

        # end general task
        logging.info(
            f"instance_id={instance_id}, trackmegeneralhealthmanager has terminated, total_run_time={round(time.time() - global_start, 3)}"
        )


dispatch(HealthTracker, sys.argv, sys.stdin, sys.stdout, __name__)