You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
4485 lines
219 KiB
4485 lines
219 KiB
#!/usr/bin/env python
|
|
# coding=utf-8
|
|
|
|
__author__ = "TrackMe Limited"
|
|
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
|
|
__credits__ = "TrackMe Limited, U.K."
|
|
__license__ = "TrackMe Limited, all rights reserved"
|
|
__version__ = "0.1.0"
|
|
__maintainer__ = "TrackMe Limited, U.K."
|
|
__email__ = "support@trackme-solutions.com"
|
|
__status__ = "PRODUCTION"
|
|
|
|
# Standard library imports
|
|
import os
|
|
import sys
|
|
import time
|
|
import json
|
|
import uuid
|
|
import threading
|
|
import hashlib
|
|
from logging.handlers import RotatingFileHandler
|
|
|
|
# Logging imports
|
|
import logging
|
|
from logging.handlers import RotatingFileHandler
|
|
|
|
# Networking imports
|
|
import requests
|
|
from requests.structures import CaseInsensitiveDict
|
|
import urllib3
|
|
|
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
|
|
|
# splunk home
|
|
splunkhome = os.environ["SPLUNK_HOME"]
|
|
|
|
# set logging
|
|
filehandler = RotatingFileHandler(
|
|
f"{splunkhome}/var/log/splunk/trackme_tracker_health.log",
|
|
mode="a",
|
|
maxBytes=10000000,
|
|
backupCount=1,
|
|
)
|
|
formatter = logging.Formatter(
|
|
"%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s"
|
|
)
|
|
logging.Formatter.converter = time.gmtime
|
|
filehandler.setFormatter(formatter)
|
|
log = logging.getLogger() # root logger - Good to get it only once.
|
|
for hdlr in log.handlers[:]: # remove the existing file handlers
|
|
if isinstance(hdlr, logging.FileHandler):
|
|
log.removeHandler(hdlr)
|
|
log.addHandler(filehandler) # set the new handler
|
|
# set the log level to INFO, DEBUG as the default is ERROR
|
|
log.setLevel(logging.INFO)
|
|
|
|
# append current directory
|
|
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
|
|
|
# import libs
|
|
import import_declare_test
|
|
|
|
# import Splunk libs
|
|
from splunklib.searchcommands import (
|
|
dispatch,
|
|
GeneratingCommand,
|
|
Configuration,
|
|
Option,
|
|
validators,
|
|
)
|
|
import splunklib.results as results
|
|
|
|
# import trackme libs
|
|
from trackme_libs import (
|
|
trackme_reqinfo,
|
|
trackme_register_tenant_object_summary,
|
|
trackme_delete_tenant_object_summary,
|
|
trackme_vtenant_account,
|
|
trackme_idx_for_tenant,
|
|
trackme_state_event,
|
|
trackme_register_tenant_component_summary,
|
|
trackme_handler_events,
|
|
trackme_manage_report_schedule,
|
|
trackme_get_version,
|
|
)
|
|
|
|
# import trackme licensing libs
|
|
from trackme_libs_licensing import trackme_check_license
|
|
|
|
# import trackme libs
|
|
from trackme_libs import (
|
|
trackme_report_update_enablement,
|
|
run_splunk_search,
|
|
trackme_gen_state,
|
|
)
|
|
|
|
# import trackme libs utils
|
|
from trackme_libs_utils import remove_leading_spaces, decode_unicode
|
|
|
|
# import trackme libs logical groups
|
|
from trackme_libs_logicalgroup import (
|
|
get_logical_groups_collection_records,
|
|
logical_group_remove_object_from_groups,
|
|
logical_group_delete_group_by_name,
|
|
)
|
|
|
|
# import TrackMe get data libs
|
|
from trackme_libs_get_data import (
|
|
get_full_kv_collection,
|
|
)
|
|
|
|
# import default vtenant account settings
|
|
from collections_data import vtenant_account_default
|
|
|
|
# import trackme libs sla
|
|
from trackme_libs_sla import trackme_sla_gen_metrics
|
|
|
|
# import trackme libs schema
|
|
from trackme_libs_schema import trackme_schema_format_version
|
|
|
|
|
|
@Configuration(distributed=False)
|
|
class HealthTracker(GeneratingCommand):
|
|
tenant_id = Option(
|
|
doc="""
|
|
**Syntax:** **tenant_id=****
|
|
**Description:** The tenant identifier.""",
|
|
require=True,
|
|
default=None,
|
|
)
|
|
|
|
get_acl = Option(
|
|
doc="""
|
|
**Syntax:** **get_acl=****
|
|
**Description:** Retrieve ACLs information for the tenant knowledge objects, disabled by default as this can generate more rest traffic and load.""",
|
|
require=False,
|
|
default=False,
|
|
validate=validators.Boolean(),
|
|
)
|
|
|
|
"""
|
|
Function to return a unique uuid which is used to trace performance run_time of each subtask.
|
|
"""
|
|
|
|
def get_uuid(self):
|
|
return str(uuid.uuid4())
|
|
|
|
def register_component_summary_async(
|
|
self, session_key, splunkd_uri, tenant_id, component
|
|
):
|
|
try:
|
|
summary_register_response = trackme_register_tenant_component_summary(
|
|
session_key,
|
|
splunkd_uri,
|
|
tenant_id,
|
|
component,
|
|
)
|
|
logging.debug(
|
|
f'function="trackme_register_tenant_component_summary", response="{json.dumps(summary_register_response, indent=2)}"'
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'failed to register the component summary with exception="{str(e)}"'
|
|
)
|
|
|
|
def generate(self, **kwargs):
|
|
|
|
# performance counter
|
|
start = time.time()
|
|
|
|
# set instance_id
|
|
instance_id = self.get_uuid()
|
|
|
|
# Get request info and set logging level
|
|
reqinfo = trackme_reqinfo(
|
|
self._metadata.searchinfo.session_key,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
)
|
|
log.setLevel(reqinfo["logging_level"])
|
|
|
|
# Build header and target URL
|
|
headers = CaseInsensitiveDict()
|
|
headers["Authorization"] = f"Splunk {self._metadata.searchinfo.session_key}"
|
|
headers["Content-Type"] = "application/json"
|
|
|
|
# Create a requests session for better performance
|
|
session = requests.Session()
|
|
session.headers.update(headers)
|
|
|
|
###########################################################################
|
|
# Verify the Virtual Tenant account with privileges escalation
|
|
###########################################################################
|
|
|
|
task_start = time.time()
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "check_vtenant_accounts"
|
|
|
|
# start task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, verifying the vtenant account'
|
|
)
|
|
|
|
try:
|
|
vtenant_account = trackme_vtenant_account(
|
|
self._metadata.searchinfo.session_key,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self.tenant_id,
|
|
)
|
|
|
|
except Exception as e:
|
|
|
|
# target
|
|
url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/configuration/admin/maintain_vtenant_account"
|
|
|
|
# proceed
|
|
try:
|
|
response = session.post(
|
|
url,
|
|
data=json.dumps(
|
|
{"tenant_id": self.tenant_id, "force_create_missing": True}
|
|
),
|
|
verify=False,
|
|
timeout=600,
|
|
)
|
|
if response.status_code not in (200, 201, 204):
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, verify vtenant account has failed, was this account deleted by mistake? response.status_code="{response.status_code}", response.text="{response.text}"'
|
|
)
|
|
raise Exception(f'verify vtenant account has failed, was this account deleted by mistake? response.status_code="{response.status_code}", response.text="{response.text}"')
|
|
else:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, account was verified successfully'
|
|
)
|
|
response_json = response.json()
|
|
|
|
# fetch the vtenant account again
|
|
vtenant_account = trackme_vtenant_account(
|
|
self._metadata.searchinfo.session_key,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self.tenant_id
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, verify vtenant account has failed, exception="{str(e)}"'
|
|
)
|
|
raise Exception(f'verify vtenant account has failed, exception="{str(e)}"')
|
|
|
|
# end task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
|
|
)
|
|
|
|
#
|
|
#
|
|
#
|
|
|
|
# get the target index
|
|
tenant_indexes = trackme_idx_for_tenant(
|
|
self._metadata.searchinfo.session_key,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self.tenant_id,
|
|
)
|
|
|
|
# get global indexes
|
|
global_indexes = {
|
|
"trackme_summary_idx": reqinfo["trackme_conf"]["index_settings"][
|
|
"trackme_summary_idx"
|
|
],
|
|
"trackme_audit_idx": reqinfo["trackme_conf"]["index_settings"][
|
|
"trackme_audit_idx"
|
|
],
|
|
"trackme_metric_idx": reqinfo["trackme_conf"]["index_settings"][
|
|
"trackme_metric_idx"
|
|
],
|
|
"trackme_notable_idx": reqinfo["trackme_conf"]["index_settings"][
|
|
"trackme_notable_idx"
|
|
],
|
|
}
|
|
logging.debug(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, global_indexes="{json.dumps(global_indexes, indent=2)}"'
|
|
)
|
|
|
|
# get trackme release
|
|
trackme_version = trackme_get_version(
|
|
self.service,
|
|
log_context={
|
|
"context_prefix": f'tenant_id="{self.tenant_id}", instance_id={instance_id}'
|
|
}
|
|
)
|
|
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, running trackme version="{trackme_version}"'
|
|
)
|
|
|
|
# set the schema_version_required
|
|
schema_version_required = trackme_schema_format_version(trackme_version)
|
|
|
|
# Get the session key
|
|
session_key = self._metadata.searchinfo.session_key
|
|
|
|
# Add the session_key to the reqinfo
|
|
reqinfo["session_key"] = session_key
|
|
|
|
# report name for logging purposes
|
|
report_name = f"trackme_health_tracker_tenant_{self.tenant_id}"
|
|
|
|
# Data collection
|
|
collection_name = "kv_trackme_virtual_tenants"
|
|
collection = self.service.kvstore[collection_name]
|
|
|
|
# Get the tenant KVrecord
|
|
query_string = {
|
|
"tenant_id": self.tenant_id,
|
|
}
|
|
vtenant_record = collection.data.query(query=json.dumps(query_string))[0]
|
|
|
|
#
|
|
# check license state
|
|
#
|
|
|
|
task_start = time.time()
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "check_licensing"
|
|
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
try:
|
|
check_license = trackme_check_license(
|
|
reqinfo["server_rest_uri"], session_key
|
|
)
|
|
license_is_valid = check_license.get("license_is_valid")
|
|
license_subscription_class = check_license.get("license_subscription_class")
|
|
license_active_tenants = check_license.get("license_active_tenants")
|
|
license_active_tenants_list = check_license.get(
|
|
"license_active_tenants_list"
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, function check_license called, task_instance_id={task_instance_id}, license_is_valid="{license_is_valid}", license_subscription_class="{license_subscription_class}", license_active_tenants="{license_active_tenants}", license_active_tenants_list="{license_active_tenants_list}"'
|
|
)
|
|
|
|
except Exception as e:
|
|
license_is_valid = 2
|
|
license_subscription_class = "unlimited"
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, function check_license has failed, exception="{str(e)}"'
|
|
)
|
|
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
|
|
)
|
|
|
|
#
|
|
# check tenants indexes settings:
|
|
# - retrieve the configured indexes for the tenant
|
|
# - retrieve via a REST call to splunkd the list of declared indexes on the search head
|
|
# - if any of the tenant defines indexes are not declared on the search head, update the tenant indexes settings to fallback to TrackMe default indexes and log the issue
|
|
#
|
|
|
|
task_start = time.time()
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "check_tenants_indexes_settings"
|
|
|
|
# start task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
def get_indexes_by_datatype(datatype=None):
|
|
"""Retrieve indexes from the search head by datatype.
|
|
|
|
Args:
|
|
datatype (str, optional): The datatype to filter by (e.g. 'metric').
|
|
If None, retrieves all indexes.
|
|
|
|
Returns:
|
|
dict: Dictionary of index names and their datatypes
|
|
"""
|
|
url = f"{reqinfo['server_rest_uri']}/services/data/indexes?output_mode=json&count=0"
|
|
if datatype:
|
|
url += f"&datatype={datatype}"
|
|
|
|
try:
|
|
response = requests.get(url, headers=headers, verify=False, timeout=600)
|
|
if response.status_code == 200:
|
|
indexes_raw = response.json().get("entry", [])
|
|
for index in indexes_raw:
|
|
if isinstance(index, dict):
|
|
index_name = index.get("name")
|
|
if index_name:
|
|
declared_indexes_dict[index_name] = {
|
|
"datatype": index.get("content", {}).get(
|
|
"datatype", ""
|
|
)
|
|
}
|
|
logging.debug(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, declared_indexes="{json.dumps(declared_indexes_dict, indent=2)}"'
|
|
)
|
|
else:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to retrieve indexes list, status code: {response.status_code}'
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, could not retrieve the list of declared indexes on the search head, exception="{str(e)}"'
|
|
)
|
|
|
|
def get_fallback_indexes(index_category=None):
|
|
"""Retrieve fallback indexes from the search head.
|
|
|
|
Returns:
|
|
dict: Dictionary of fallback indexes
|
|
"""
|
|
|
|
fallback_indexes = {
|
|
"trackme_summary_idx": "trackme_summary",
|
|
"trackme_audit_idx": "trackme_audit",
|
|
"trackme_metric_idx": "trackme_metrics",
|
|
"trackme_notable_idx": "trackme_notable",
|
|
}
|
|
|
|
if index_category:
|
|
return fallback_indexes.get(index_category, None)
|
|
else:
|
|
return fallback_indexes
|
|
|
|
# get the tenant indexes settings
|
|
tenant_indexes_settings = trackme_idx_for_tenant(
|
|
session_key,
|
|
reqinfo["server_rest_uri"],
|
|
self.tenant_id,
|
|
)
|
|
logging.debug(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_indexes_settings="{json.dumps(tenant_indexes_settings, indent=2)}"'
|
|
)
|
|
|
|
""" Example of tenant_indexes_settings:
|
|
{
|
|
"trackme_summary_idx": "trackme_summary",
|
|
"trackme_audit_idx": "trackme_audit",
|
|
"trackme_metric_idx": "trackme_metrics",
|
|
"trackme_notable_idx": "trackme_notable"
|
|
}
|
|
"""
|
|
|
|
# check if tenant_indexes_settings is set to global
|
|
tenant_indexes_uses_global_indexes = False
|
|
|
|
if tenant_indexes_settings == "global":
|
|
tenant_indexes_settings = global_indexes
|
|
tenant_indexes_uses_global_indexes = True
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_indexes_settings set to global, will check the search head for declared indexes.'
|
|
)
|
|
|
|
# process
|
|
declared_indexes_dict = {}
|
|
|
|
# Get all indexes (events)
|
|
get_indexes_by_datatype()
|
|
|
|
# Get metrics indexes
|
|
get_indexes_by_datatype(datatype="metric")
|
|
|
|
# only proceed if we have declared indexes
|
|
if not declared_indexes_dict:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, no declared indexes found, skipping tenant indexes settings check.'
|
|
)
|
|
return
|
|
|
|
# for each index in the tenant indexes settings, check if it is declared on the search head
|
|
# we also want to check for trackme_metrics_idx that the datatype is set to "metric"
|
|
# if not, we will force update the tenant indexes settings to fallback to TrackMe default indexes
|
|
|
|
invalid_indexes_settings_detected = False
|
|
|
|
# process the tenant indexes settings
|
|
for index_category, index_value in tenant_indexes_settings.items():
|
|
if not isinstance(index_value, str):
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, index_category="{index_category}" has invalid index value type: {type(index_value)}'
|
|
)
|
|
invalid_indexes_settings_detected = True
|
|
# update the tenant indexes settings for the current index_category
|
|
tenant_indexes_settings[index_category] = get_fallback_indexes(
|
|
index_category
|
|
)
|
|
continue
|
|
|
|
if index_value not in declared_indexes_dict:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, index_category="{index_category}", index_value="{index_value}" is not declared on the search head, this is an invalid configuration, we will force update the tenant indexes settings to fallback to TrackMe default indexes. Please ensure to define indexes in the search head tier before attempting to configure your tenant indexes settings.'
|
|
)
|
|
invalid_indexes_settings_detected = True
|
|
# update the tenant indexes settings for the current index_category
|
|
tenant_indexes_settings[index_category] = get_fallback_indexes(
|
|
index_category
|
|
)
|
|
continue
|
|
|
|
elif index_category == "trackme_metrics_idx":
|
|
index_info = declared_indexes_dict.get(index_value, {})
|
|
if index_info.get("datatype") != "metric":
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, index_category="{index_category}", index_value="{index_value}" is not configured as a metric index, this is an invalid configuration, we will force update the tenant indexes settings to fallback to TrackMe default indexes.'
|
|
)
|
|
invalid_indexes_settings_detected = True
|
|
# update the tenant indexes settings for the current index_category
|
|
tenant_indexes_settings[index_category] = get_fallback_indexes(
|
|
index_category
|
|
)
|
|
continue
|
|
|
|
if not invalid_indexes_settings_detected:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, no invalid indexes settings detected, nothing to do.'
|
|
)
|
|
else:
|
|
# If we were using global indexes and found issues, we need to fallback to default indexes
|
|
if tenant_indexes_uses_global_indexes:
|
|
logging.warning(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, issues detected with global indexes, falling back to default indexes.'
|
|
)
|
|
tenant_indexes_settings = get_fallback_indexes()
|
|
|
|
# fix the tenant indexes settings
|
|
vtenant_record["tenant_idx_settings"] = json.dumps(
|
|
tenant_indexes_settings, indent=2
|
|
)
|
|
try:
|
|
self.service.kvstore["kv_trackme_virtual_tenants"].data.update(
|
|
str(vtenant_record["_key"]), json.dumps(vtenant_record)
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, vtenant_record updated successfully, new tenant_idx_settings="{json.dumps(tenant_indexes_settings, indent=2)}"'
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, Failed to update vtenant_record, exception: {str(e)}'
|
|
)
|
|
|
|
# end task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
|
|
)
|
|
|
|
##################################################################################
|
|
# Global system verifications: verify that the relevant scheduled jobs are enabled
|
|
##################################################################################
|
|
|
|
# These jobs are not tenant specifics, however we use the health tracker to ensure that
|
|
# these are effectively enabled when at least one tenant has been created and is active
|
|
|
|
task_start = time.time()
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "check_global_trackers_enablement"
|
|
|
|
# start task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
savedsearch_names = [
|
|
"trackme_ack_expiration_tracker",
|
|
"trackme_maintenance_mode_tracker",
|
|
"trackme_backup_scheduler",
|
|
"trackme_general_health_manager",
|
|
]
|
|
|
|
for savedsearch_name in savedsearch_names:
|
|
# check ack expiration tracker
|
|
update_properties_required = False
|
|
|
|
try:
|
|
mysavedsearch = self.service.saved_searches[savedsearch_name]
|
|
current_disabled = int(mysavedsearch["disabled"])
|
|
logging.debug(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, global config check, verifying savedsearch="{mysavedsearch.name}", disabled="{current_disabled}"'
|
|
)
|
|
|
|
if current_disabled == 1:
|
|
update_properties_required = True
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, global configuration verification, could not retrieve the status for {savedsearch_name}'
|
|
)
|
|
|
|
if update_properties_required:
|
|
try:
|
|
action = trackme_report_update_enablement(
|
|
session_key,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self.tenant_id,
|
|
savedsearch_name,
|
|
"enable",
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, global config check, enabling savedsearch="{savedsearch_name}", result="{action}"'
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, global config check, an exception was encountered while trying to enable savedsearch="{savedsearch_name}", exception="{str(e)}"'
|
|
)
|
|
|
|
# end task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
|
|
)
|
|
|
|
##################################################################################
|
|
# Optimize: enable or disable the schedule for utilities depending on the tenant
|
|
# settings, and conditions
|
|
##################################################################################
|
|
|
|
task_start = time.time()
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "optimize_tenant_scheduled_reports"
|
|
|
|
# start task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
# Define the valid components
|
|
valid_components = {"dsm", "dhm", "mhm", "flx", "wlk", "fqm"}
|
|
|
|
def manage_savedsearch_schedule(
|
|
savedsearch_names, feature_enabled, feature_name
|
|
):
|
|
"""
|
|
Helper function to manage saved search scheduling based on feature enablement.
|
|
|
|
Args:
|
|
savedsearch_names: List of saved search names to manage
|
|
feature_enabled: Boolean indicating if the feature should be enabled
|
|
feature_name: String name of the feature for logging purposes
|
|
"""
|
|
for savedsearch_name in savedsearch_names:
|
|
# get the status of the savedsearch
|
|
savedsearch_properties, savedsearch_acl = (
|
|
trackme_manage_report_schedule(
|
|
logging,
|
|
session_key,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self.tenant_id,
|
|
savedsearch_name,
|
|
action="status",
|
|
)
|
|
)
|
|
|
|
# log
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, savedsearch="{savedsearch_name}", savedsearch_properties="{json.dumps(savedsearch_properties, indent=2)}", savedsearch_acl="{json.dumps(savedsearch_acl, indent=2)}"'
|
|
)
|
|
|
|
# get the is_scheduled status
|
|
is_scheduled = int(savedsearch_properties.get("is_scheduled", 0))
|
|
|
|
# avoid failing to schedule the savedsearch if any of the following is missing or equal to None:
|
|
# dispatch.earliest_time
|
|
# dispatch.latest_time
|
|
# cron_schedule
|
|
# schedule_window
|
|
|
|
# outliers_mltrain:
|
|
# "cron_schedule": "*/60 * * * *",
|
|
# "dispatch.earliest_time": "-5m",
|
|
# "dispatch.latest_time": "now",
|
|
# "schedule_window": "5",
|
|
|
|
# outliers_mlmonitor:
|
|
# "cron_schedule": "*/20 * * * *",
|
|
# "dispatch.earliest_time": "-5m",
|
|
# "dispatch.latest_time": "now",
|
|
# "schedule_window": "5",
|
|
|
|
# data_sampling:
|
|
# "cron_schedule": "*/20 * * * *",
|
|
# "dispatch.earliest_time": "-24h",
|
|
# "dispatch.latest_time": "-4h",
|
|
# "schedule_window": "5",
|
|
|
|
# adaptive_delay:
|
|
# "cron_schedule": "*/20 * * * *",
|
|
# "dispatch.earliest_time": "-5m",
|
|
# "dispatch.latest_time": "now",
|
|
# "schedule_window": "5",
|
|
|
|
# delayed_inspector:
|
|
# "cron_schedule": "*/20 * * * *",
|
|
# "dispatch.earliest_time": "-5m",
|
|
# "dispatch.latest_time": "now",
|
|
# "schedule_window": "5",
|
|
|
|
# if any of these parameters is missing in the savedsearch properties, we need to add them
|
|
if "dispatch.earliest_time" not in savedsearch_properties or savedsearch_properties.get("dispatch.earliest_time") in (None, 'None', ''):
|
|
if "outliers_mltrain" in savedsearch_name:
|
|
savedsearch_properties["dispatch.earliest_time"] = "-5m"
|
|
elif "outliers_mlmonitor" in savedsearch_name:
|
|
savedsearch_properties["dispatch.earliest_time"] = "-5m"
|
|
elif "data_sampling" in savedsearch_name:
|
|
savedsearch_properties["dispatch.earliest_time"] = "-24h"
|
|
elif "adaptive_delay" in savedsearch_name:
|
|
savedsearch_properties["dispatch.earliest_time"] = "-5m"
|
|
elif "delayed_entities_inspector" in savedsearch_name:
|
|
savedsearch_properties["dispatch.earliest_time"] = "-5m"
|
|
else:
|
|
savedsearch_properties["dispatch.earliest_time"] = "-5m"
|
|
|
|
if "dispatch.latest_time" not in savedsearch_properties or savedsearch_properties.get("dispatch.latest_time") in (None, 'None', ''):
|
|
if "outliers_mltrain" in savedsearch_name:
|
|
savedsearch_properties["dispatch.latest_time"] = "now"
|
|
elif "outliers_mlmonitor" in savedsearch_name:
|
|
savedsearch_properties["dispatch.latest_time"] = "now"
|
|
elif "data_sampling" in savedsearch_name:
|
|
savedsearch_properties["dispatch.latest_time"] = "-4h"
|
|
elif "adaptive_delay" in savedsearch_name:
|
|
savedsearch_properties["dispatch.latest_time"] = "now"
|
|
elif "delayed_entities_inspector" in savedsearch_name:
|
|
savedsearch_properties["dispatch.latest_time"] = "now"
|
|
else:
|
|
savedsearch_properties["dispatch.latest_time"] = "now"
|
|
|
|
if "cron_schedule" not in savedsearch_properties or savedsearch_properties.get("cron_schedule") in (None, 'None', ''):
|
|
if "outliers_mltrain" in savedsearch_name:
|
|
savedsearch_properties["cron_schedule"] = "0 22-23,0-6 * * *"
|
|
elif "outliers_mlmonitor" in savedsearch_name:
|
|
savedsearch_properties["cron_schedule"] = "*/20 * * * *"
|
|
elif "data_sampling" in savedsearch_name:
|
|
savedsearch_properties["cron_schedule"] = "*/20 22-23,0-6 * * *"
|
|
elif "adaptive_delay" in savedsearch_name:
|
|
savedsearch_properties["cron_schedule"] = "*/20 * * * *"
|
|
elif "delayed_entities_inspector" in savedsearch_name:
|
|
savedsearch_properties["cron_schedule"] = "*/20 * * * *"
|
|
else:
|
|
savedsearch_properties["cron_schedule"] = "*/5 * * * *"
|
|
|
|
if "schedule_window" not in savedsearch_properties or savedsearch_properties.get("schedule_window") in (None, 'None', ''):
|
|
savedsearch_properties["schedule_window"] = "5"
|
|
|
|
# act
|
|
if is_scheduled == 1 and feature_enabled == False:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", disabling savedsearch.'
|
|
)
|
|
try:
|
|
savedsearch_properties, savedsearch_acl = (
|
|
trackme_manage_report_schedule(
|
|
logging,
|
|
session_key,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self.tenant_id,
|
|
savedsearch_name,
|
|
input_report_properties=savedsearch_properties,
|
|
action="disable",
|
|
)
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", savedsearch updated successfully, properties="{json.dumps(savedsearch_properties, indent=2)}"'
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", an exception was encountered while trying to update savedsearch, exception="{str(e)}"'
|
|
)
|
|
|
|
elif is_scheduled == 0 and feature_enabled == True:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", enabling savedsearch.'
|
|
)
|
|
try:
|
|
savedsearch_properties, savedsearch_acl = (
|
|
trackme_manage_report_schedule(
|
|
logging,
|
|
session_key,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self.tenant_id,
|
|
savedsearch_name,
|
|
input_report_properties=savedsearch_properties,
|
|
action="enable",
|
|
)
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", savedsearch updated successfully, properties="{json.dumps(savedsearch_properties, indent=2)}"'
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", an exception was encountered while trying to update savedsearch, exception="{str(e)}"'
|
|
)
|
|
|
|
else:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", savedsearch="{savedsearch_name}", is_scheduled="{is_scheduled}", {feature_name}_feature_enabled="{feature_enabled}", nothing to do.'
|
|
)
|
|
|
|
# Process except for replica tenants
|
|
try:
|
|
tenant_replica = int(vtenant_record.get("tenant_replica", 0))
|
|
except Exception as e:
|
|
tenant_replica = 0
|
|
|
|
if tenant_replica == 1:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, detected replica tenant by name pattern, setting tenant_replica=1'
|
|
)
|
|
|
|
# Log replica tenant status for debugging
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_replica="{tenant_replica}", will_process="{tenant_replica == 0}"'
|
|
)
|
|
|
|
if tenant_replica == 0: # only process non-replica tenants (value: 0)
|
|
|
|
for valid_component in valid_components:
|
|
valid_component_is_enabled = int(
|
|
vtenant_record.get(f"tenant_{valid_component}_enabled", 0)
|
|
)
|
|
|
|
if valid_component_is_enabled == 1:
|
|
|
|
# only for dsm/dhm/flx/wlk
|
|
if valid_component in ("dsm", "dhm", "flx", "wlk", "fqm"):
|
|
|
|
#
|
|
# ML Outliers
|
|
#
|
|
|
|
try:
|
|
|
|
savedsearch_names = [
|
|
f"trackme_{valid_component}_outliers_mltrain_tracker_tenant_{self.tenant_id}",
|
|
f"trackme_{valid_component}_outliers_mlmonitor_tracker_tenant_{self.tenant_id}",
|
|
]
|
|
|
|
# Default to True
|
|
feature_enabled = True
|
|
|
|
# Construct the key dynamically
|
|
key = f"mloutliers_{valid_component}"
|
|
|
|
# Check if the component is valid and handle exceptions
|
|
if valid_component in valid_components:
|
|
try:
|
|
feature_enablement = int(vtenant_account.get(key, 1))
|
|
if feature_enablement == 0:
|
|
feature_enabled = False
|
|
except (ValueError, TypeError):
|
|
feature_enabled = True
|
|
else:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}" is not valid, valid components are {valid_components}'
|
|
)
|
|
|
|
manage_savedsearch_schedule(
|
|
savedsearch_names, feature_enabled, "outliers"
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"'
|
|
)
|
|
|
|
#
|
|
# Sampling (dsm only)
|
|
#
|
|
|
|
try:
|
|
|
|
if valid_component == "dsm":
|
|
|
|
savedsearch_names = [
|
|
f"trackme_dsm_data_sampling_tracker_tenant_{self.tenant_id}",
|
|
]
|
|
|
|
# Default to True
|
|
feature_enabled = True
|
|
|
|
# Construct the key dynamically
|
|
key = f"sampling"
|
|
|
|
# Check if the component is valid and handle exceptions
|
|
if valid_component in valid_components:
|
|
try:
|
|
feature_enablement = int(vtenant_account.get(key, 1))
|
|
if feature_enablement == 0:
|
|
feature_enabled = False
|
|
except (ValueError, TypeError):
|
|
feature_enabled = True
|
|
else:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}" is not valid, valid components are {valid_components}'
|
|
)
|
|
|
|
manage_savedsearch_schedule(
|
|
savedsearch_names, feature_enabled, "sampling"
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"'
|
|
)
|
|
|
|
#
|
|
# Adaptive delay (dsm only)
|
|
#
|
|
|
|
try:
|
|
|
|
if valid_component == "dsm":
|
|
|
|
savedsearch_names = [
|
|
f"trackme_dsm_adaptive_delay_tracker_tenant_{self.tenant_id}",
|
|
]
|
|
|
|
# Default to True
|
|
feature_enabled = True
|
|
|
|
# Construct the key dynamically
|
|
key = f"adaptive_delay"
|
|
|
|
# Check if the component is valid and handle exceptions
|
|
if valid_component in valid_components:
|
|
try:
|
|
feature_enablement = int(vtenant_account.get(key, 1))
|
|
if feature_enablement == 0:
|
|
feature_enabled = False
|
|
except (ValueError, TypeError):
|
|
feature_enabled = True
|
|
else:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}" is not valid, valid components are {valid_components}'
|
|
)
|
|
|
|
manage_savedsearch_schedule(
|
|
savedsearch_names, feature_enabled, "adaptive_delay"
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"'
|
|
)
|
|
|
|
#
|
|
# Delayed inspector (dsm/dhm only)
|
|
#
|
|
|
|
try:
|
|
|
|
if valid_component in ("dsm", "dhm"):
|
|
|
|
savedsearch_names = [
|
|
f"trackme_{valid_component}_delayed_entities_inspector_tracker_tenant_{self.tenant_id}",
|
|
]
|
|
|
|
# Default to True
|
|
feature_enabled = True
|
|
|
|
# Construct the key dynamically
|
|
keys = [
|
|
"splk_feeds_delayed_inspector_24hours_range_min_sec",
|
|
"splk_feeds_delayed_inspector_7days_range_min_sec",
|
|
"splk_feeds_delayed_inspector_until_disabled_range_min_sec",
|
|
]
|
|
|
|
# Check if the component is valid and handle exceptions (all keys must be set to 0 for the feature to be disabled)
|
|
if valid_component in valid_components:
|
|
try:
|
|
feature_enabled = True # Default to enabled
|
|
for key in keys:
|
|
feature_enablement = int(
|
|
vtenant_account.get(key, 1)
|
|
)
|
|
if feature_enablement != 0:
|
|
# If any key is not 0, the feature should be enabled
|
|
feature_enabled = True
|
|
break
|
|
else:
|
|
# If we get here, all keys were 0, so disable the feature
|
|
feature_enabled = False
|
|
except (ValueError, TypeError):
|
|
feature_enabled = True
|
|
else:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}" is not valid, valid components are {valid_components}'
|
|
)
|
|
|
|
manage_savedsearch_schedule(
|
|
savedsearch_names, feature_enabled, "delayed_inspector"
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"'
|
|
)
|
|
|
|
#
|
|
# Priority policies: depends on if we have content in the KVstore collection
|
|
#
|
|
|
|
try:
|
|
|
|
savedsearch_names = [
|
|
f"trackme_{valid_component}_priority_tracker_tenant_{self.tenant_id}",
|
|
]
|
|
|
|
priority_collection_name = f"kv_trackme_{valid_component}_priority_policies_tenant_{self.tenant_id}"
|
|
priority_collection = self.service.kvstore[priority_collection_name]
|
|
(
|
|
priority_records,
|
|
priority_collection_keys,
|
|
priority_collection_dict,
|
|
) = get_full_kv_collection(
|
|
priority_collection, priority_collection_name
|
|
)
|
|
|
|
# check if we have content in the collection
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", priority_collection_name="{priority_collection_name}", priority_records_count="{len(priority_records)}"'
|
|
)
|
|
feature_enabled = bool(priority_records)
|
|
|
|
manage_savedsearch_schedule(
|
|
savedsearch_names, feature_enabled, "priority_policies"
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"'
|
|
)
|
|
|
|
#
|
|
# Tags policies: depends on if we have content in the KVstore collection
|
|
#
|
|
|
|
try:
|
|
|
|
savedsearch_names = [
|
|
f"trackme_{valid_component}_tags_tracker_tenant_{self.tenant_id}",
|
|
]
|
|
|
|
tags_collection_name = f"kv_trackme_{valid_component}_tags_policies_tenant_{self.tenant_id}"
|
|
tags_collection = self.service.kvstore[tags_collection_name]
|
|
tags_records, tags_collection_keys, tags_collection_dict = (
|
|
get_full_kv_collection(tags_collection, tags_collection_name)
|
|
)
|
|
|
|
# check if we have content in the collection
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", tags_collection_name="{tags_collection_name}", tags_records_count="{len(tags_records)}"'
|
|
)
|
|
feature_enabled = bool(tags_records)
|
|
|
|
manage_savedsearch_schedule(
|
|
savedsearch_names, feature_enabled, "tags_policies"
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"'
|
|
)
|
|
|
|
#
|
|
# SLA policies: depends on if we have content in the KVstore collection
|
|
#
|
|
|
|
try:
|
|
|
|
savedsearch_names = [
|
|
f"trackme_{valid_component}_sla_tracker_tenant_{self.tenant_id}",
|
|
]
|
|
|
|
sla_collection_name = f"kv_trackme_{valid_component}_sla_policies_tenant_{self.tenant_id}"
|
|
sla_collection = self.service.kvstore[sla_collection_name]
|
|
sla_records, sla_collection_keys, sla_collection_dict = (
|
|
get_full_kv_collection(sla_collection, sla_collection_name)
|
|
)
|
|
|
|
# check if we have content in the collection
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", tags_collection_name="{tags_collection_name}", tags_records_count="{len(sla_records)}"'
|
|
)
|
|
feature_enabled = bool(tags_records)
|
|
|
|
manage_savedsearch_schedule(
|
|
savedsearch_names, feature_enabled, "sla_policies"
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"'
|
|
)
|
|
|
|
#
|
|
# Shared Elastic Tracker: depends on if we have content in the KVstore collection (dsm only)
|
|
#
|
|
|
|
try:
|
|
|
|
if valid_component == "dsm":
|
|
|
|
savedsearch_names = [
|
|
f"trackme_dsm_shared_elastic_tracker_tenant_{self.tenant_id}",
|
|
]
|
|
|
|
shared_elastic_collection_name = (
|
|
f"kv_trackme_dsm_elastic_shared_tenant_{self.tenant_id}"
|
|
)
|
|
shared_elastic_collection = self.service.kvstore[
|
|
shared_elastic_collection_name
|
|
]
|
|
(
|
|
shared_elastic_records,
|
|
shared_elastic_collection_keys,
|
|
shared_elastic_collection_dict,
|
|
) = get_full_kv_collection(
|
|
shared_elastic_collection, shared_elastic_collection_name
|
|
)
|
|
|
|
# check if we have content in the collection
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", shared_elastic_collection_name="{shared_elastic_collection_name}", shared_elastic_records_count="{len(shared_elastic_records)}"'
|
|
)
|
|
feature_enabled = bool(shared_elastic_records)
|
|
|
|
manage_savedsearch_schedule(
|
|
savedsearch_names, feature_enabled, "shared_elastic"
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, component="{valid_component}", an exception was encountered while trying to manage savedsearch schedule, exception="{str(e)}"'
|
|
)
|
|
else:
|
|
# Skip processing for replica tenants
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, skipping replica tenant processing, tenant_replica="{tenant_replica}"'
|
|
)
|
|
|
|
# end task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
|
|
)
|
|
|
|
##################################################################################
|
|
# Replica orchestrator
|
|
##################################################################################
|
|
|
|
# This job scheduled will automatically be enabled if we detect that at least one
|
|
# replica tracker has been created
|
|
|
|
task_start = time.time()
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "replica_orchestrator"
|
|
|
|
# start task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
# Try to get the current definition
|
|
try:
|
|
tenant_replica_objects = vtenant_record.get("tenant_replica_objects")
|
|
|
|
# logging debug
|
|
logging.debug(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_replica_objects="{tenant_replica_objects}"'
|
|
)
|
|
except Exception as e:
|
|
tenant_replica_objects = None
|
|
|
|
# only run if we have a proper replica object
|
|
if tenant_replica_objects:
|
|
savedsearch_names = [
|
|
"trackme_replica_executor",
|
|
]
|
|
|
|
for savedsearch_name in savedsearch_names:
|
|
# check
|
|
update_properties_required = False
|
|
|
|
try:
|
|
mysavedsearch = self.service.saved_searches[savedsearch_name]
|
|
current_disabled = int(mysavedsearch["disabled"])
|
|
logging.debug(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, replica config check, verifying savedsearch="{mysavedsearch.name}", disabled="{current_disabled}"'
|
|
)
|
|
|
|
if current_disabled == 1:
|
|
update_properties_required = True
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, replica configuration verification, could not retrieve the status for {savedsearch_name}'
|
|
)
|
|
|
|
if update_properties_required:
|
|
try:
|
|
action = trackme_report_update_enablement(
|
|
session_key,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self.tenant_id,
|
|
savedsearch_name,
|
|
"enable",
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, replica config check, enabling savedsearch="{savedsearch_name}", result="{action}"'
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, replica config check, an exception was encountered while trying to enable savedsearch="{savedsearch_name}", exception="{str(e)}"'
|
|
)
|
|
|
|
# end task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
|
|
)
|
|
|
|
###########################################################################
|
|
# schema update and migration: detect and migrate Virtual Tenants if needed
|
|
###########################################################################
|
|
|
|
task_start = time.time()
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "schema_upgrade"
|
|
|
|
# start task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
from trackme_libs_schema import (
|
|
trackme_schema_get_version,
|
|
trackme_schema_update_version,
|
|
trackme_schema_upgrade_2009,
|
|
trackme_schema_upgrade_2015,
|
|
trackme_schema_upgrade_2016,
|
|
trackme_schema_upgrade_2020,
|
|
trackme_schema_upgrade_2026,
|
|
trackme_schema_upgrade_2034,
|
|
trackme_schema_upgrade_2034_least_privileges,
|
|
trackme_schema_upgrade_2036,
|
|
trackme_schema_upgrade_2038,
|
|
trackme_schema_upgrade_2043,
|
|
trackme_schema_upgrade_2044,
|
|
trackme_schema_upgrade_2045,
|
|
trackme_schema_upgrade_2054,
|
|
trackme_schema_upgrade_2064,
|
|
trackme_schema_upgrade_2067,
|
|
trackme_schema_upgrade_2070,
|
|
trackme_schema_upgrade_2071,
|
|
trackme_schema_upgrade_2072,
|
|
trackme_schema_upgrade_2075,
|
|
trackme_schema_upgrade_2078,
|
|
trackme_schema_upgrade_2083,
|
|
trackme_schema_upgrade_2084,
|
|
trackme_schema_upgrade_2087,
|
|
trackme_schema_upgrade_2089,
|
|
trackme_schema_upgrade_2090,
|
|
trackme_schema_upgrade_2091,
|
|
trackme_schema_upgrade_2094,
|
|
trackme_schema_upgrade_2095,
|
|
trackme_schema_upgrade_2096,
|
|
trackme_schema_upgrade_2097,
|
|
trackme_schema_upgrade_2098,
|
|
trackme_schema_upgrade_2099,
|
|
trackme_schema_upgrade_2100,
|
|
trackme_schema_upgrade_2101,
|
|
trackme_schema_upgrade_2102,
|
|
trackme_schema_upgrade_2104,
|
|
trackme_schema_upgrade_2105,
|
|
trackme_schema_upgrade_2107,
|
|
trackme_schema_upgrade_2108,
|
|
trackme_schema_upgrade_2109,
|
|
trackme_schema_upgrade_2110,
|
|
trackme_schema_upgrade_2111,
|
|
trackme_schema_upgrade_2116,
|
|
trackme_schema_upgrade_2118,
|
|
trackme_schema_upgrade_2119,
|
|
trackme_schema_upgrade_2121,
|
|
trackme_schema_upgrade_2122,
|
|
trackme_schema_upgrade_2123,
|
|
trackme_schema_upgrade_2126,
|
|
trackme_schema_upgrade_2128,
|
|
trackme_schema_upgrade_2130,
|
|
trackme_schema_upgrade_2131,
|
|
trackme_schema_upgrade_2132,
|
|
trackme_schema_upgrade_2300,
|
|
trackme_schema_upgrade_2304,
|
|
trackme_schema_upgrade_2305,
|
|
)
|
|
|
|
# Define a mapping between schema versions and their upgrade functions
|
|
schema_upgrades = [
|
|
(2009, trackme_schema_upgrade_2009),
|
|
(2015, trackme_schema_upgrade_2015),
|
|
(2016, trackme_schema_upgrade_2016),
|
|
(2020, trackme_schema_upgrade_2020),
|
|
(2026, trackme_schema_upgrade_2026),
|
|
(2034, trackme_schema_upgrade_2034),
|
|
(2034, trackme_schema_upgrade_2034_least_privileges),
|
|
(2036, trackme_schema_upgrade_2036),
|
|
(2038, trackme_schema_upgrade_2038),
|
|
(2043, trackme_schema_upgrade_2043),
|
|
(2043, trackme_schema_upgrade_2044),
|
|
(2045, trackme_schema_upgrade_2045),
|
|
(2054, trackme_schema_upgrade_2054),
|
|
(2064, trackme_schema_upgrade_2064),
|
|
(2067, trackme_schema_upgrade_2067),
|
|
(2070, trackme_schema_upgrade_2070),
|
|
(2071, trackme_schema_upgrade_2071),
|
|
(2072, trackme_schema_upgrade_2072),
|
|
(2075, trackme_schema_upgrade_2075),
|
|
(2078, trackme_schema_upgrade_2078),
|
|
(2083, trackme_schema_upgrade_2083),
|
|
(2084, trackme_schema_upgrade_2084),
|
|
(2087, trackme_schema_upgrade_2087),
|
|
(2089, trackme_schema_upgrade_2089),
|
|
(2090, trackme_schema_upgrade_2090),
|
|
(2091, trackme_schema_upgrade_2091),
|
|
(2094, trackme_schema_upgrade_2094),
|
|
(2095, trackme_schema_upgrade_2095),
|
|
(2096, trackme_schema_upgrade_2096),
|
|
(2097, trackme_schema_upgrade_2097),
|
|
(2098, trackme_schema_upgrade_2098),
|
|
(2099, trackme_schema_upgrade_2099),
|
|
(2100, trackme_schema_upgrade_2100),
|
|
(2101, trackme_schema_upgrade_2101),
|
|
(2102, trackme_schema_upgrade_2102),
|
|
(2104, trackme_schema_upgrade_2104),
|
|
(2105, trackme_schema_upgrade_2105),
|
|
(2107, trackme_schema_upgrade_2107),
|
|
(2108, trackme_schema_upgrade_2108),
|
|
(2109, trackme_schema_upgrade_2109),
|
|
(2110, trackme_schema_upgrade_2110),
|
|
(2111, trackme_schema_upgrade_2111),
|
|
(2116, trackme_schema_upgrade_2116),
|
|
(2118, trackme_schema_upgrade_2118),
|
|
(2119, trackme_schema_upgrade_2119),
|
|
(2121, trackme_schema_upgrade_2121),
|
|
(2122, trackme_schema_upgrade_2122),
|
|
(2123, trackme_schema_upgrade_2123),
|
|
(2126, trackme_schema_upgrade_2126),
|
|
(2128, trackme_schema_upgrade_2128),
|
|
(2130, trackme_schema_upgrade_2130),
|
|
(2131, trackme_schema_upgrade_2131),
|
|
(2132, trackme_schema_upgrade_2132),
|
|
(2300, trackme_schema_upgrade_2300),
|
|
(2304, trackme_schema_upgrade_2304),
|
|
(2305, trackme_schema_upgrade_2305),
|
|
]
|
|
|
|
# Get the current schema version
|
|
try:
|
|
schema_version = trackme_schema_get_version(
|
|
reqinfo,
|
|
self.tenant_id,
|
|
schema_version_required,
|
|
task_name,
|
|
task_instance_id,
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to call function trackme_schema_get_version, exception="{str(e)}"'
|
|
)
|
|
|
|
# If schema_version_required is 0 (version retrieval failed), skip upgrade logic
|
|
# to align with graceful degradation when DB Connect causes permission issues
|
|
if schema_version_required == 0:
|
|
logging.warning(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, schema_version_required is 0 (version retrieval failed), skipping schema upgrade logic to prevent data corruption.'
|
|
)
|
|
# Proceed
|
|
elif not schema_version or int(schema_version) != int(schema_version_required):
|
|
|
|
#
|
|
# Backup
|
|
#
|
|
|
|
# Check and act accordingly
|
|
trackme_backup_attempted = False
|
|
|
|
# Run TrackMe backup: verify if a backup was initiated or performed during the last 24 hours, otherwise initiate a backup
|
|
trackme_backup_run = True
|
|
|
|
# recent_backup_events_count
|
|
recent_backup_events_count = 0
|
|
|
|
# recent_backup_events_raw
|
|
recent_backup_events_raw = []
|
|
|
|
# run a Splunk search to identify the last backup initiated time
|
|
search = remove_leading_spaces(
|
|
f"""\
|
|
search (index=_internal sourcetype=trackme:custom_commands:trackmetrackerhealth task=schema_upgrade "initiating backup now") OR (index=_internal sourcetype=trackme:rest_api trackme.rest.backup_and_restore trackme_rest_handler_backup_and_restore.py post_backup "Backup archive created successfully") | stats count, values(_raw) as last_events
|
|
"""
|
|
)
|
|
|
|
# kwargs
|
|
kwargs_search = {
|
|
"earliest_time": "-24h",
|
|
"latest_time": "now",
|
|
"preview": "false",
|
|
"output_mode": "json",
|
|
"count": 0,
|
|
}
|
|
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, inspecting logs to identify any recent backups.'
|
|
)
|
|
|
|
try:
|
|
reader = run_splunk_search(
|
|
self.service,
|
|
search,
|
|
kwargs_search,
|
|
24,
|
|
5,
|
|
)
|
|
|
|
for item in reader:
|
|
if isinstance(item, dict):
|
|
recent_backup_events_count = int(item.get("count", 0))
|
|
recent_backup_events_raw = item.get("last_events", [])
|
|
|
|
except Exception as e:
|
|
msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, recent backup identification search failed with exception="{str(e)}"'
|
|
logging.error(msg)
|
|
|
|
# if we have detected a recent backup, we will not run a backup
|
|
if recent_backup_events_count > 0:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, recent backup was detected, no backup will be initiated, recent_backup_events_count="{recent_backup_events_count}", recent_backup_events_raw="{recent_backup_events_raw}"'
|
|
)
|
|
trackme_backup_run = False
|
|
|
|
else:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, no recent backup was detected, initiating backup now, recent_backup_events_count="{recent_backup_events_count}", recent_backup_events_raw="{recent_backup_events_raw}"'
|
|
)
|
|
|
|
# before running the first function, execute TrackMe's builtin backup job
|
|
if trackme_backup_run:
|
|
if not trackme_backup_attempted:
|
|
try:
|
|
response = session.post(
|
|
f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/backup_and_restore/backup",
|
|
data=json.dumps(
|
|
{
|
|
"comment": f"Backup initiated for schema migration from version {schema_version} to {schema_version_required}"
|
|
}
|
|
),
|
|
verify=False,
|
|
timeout=900,
|
|
)
|
|
if response.status_code not in (200, 201, 204):
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, backup post call has failed, response.status_code="{response.status_code}", response.text="{response.text}"'
|
|
)
|
|
else:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, backup post call executed successfully'
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, backup post call has failed, exception="{str(e)}"'
|
|
)
|
|
trackme_backup_attempted = True
|
|
|
|
#
|
|
# schema upgrade
|
|
#
|
|
|
|
for version, upgrade_func in schema_upgrades:
|
|
if not schema_version or int(schema_version) < version:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, detected migration required for schema version {version}, schema_version="{schema_version}", schema_version_required="{schema_version_required}", processing now.'
|
|
)
|
|
|
|
# proceed
|
|
try:
|
|
schema_version_update = upgrade_func(
|
|
reqinfo,
|
|
self.tenant_id,
|
|
int(schema_version),
|
|
int(schema_version_required),
|
|
task_name,
|
|
task_instance_id,
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, schema version {version} migrated successfully.'
|
|
)
|
|
|
|
# Update schema version after each successful upgrade
|
|
try:
|
|
schema_version_update = trackme_schema_update_version(
|
|
reqinfo,
|
|
self.tenant_id,
|
|
version, # Update to current version being processed
|
|
task_name,
|
|
task_instance_id,
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, schema version updated to {version} after successful upgrade.'
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to update schema version to {version}, exception="{str(e)}"'
|
|
)
|
|
raise # Re-raise the exception to stop the upgrade process
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to call function {upgrade_func.__name__}, exception="{str(e)}"'
|
|
)
|
|
raise # Re-raise the exception to stop the upgrade process
|
|
|
|
#
|
|
# finally migrate the schema version to the required version if not already there
|
|
#
|
|
|
|
try:
|
|
if int(schema_version) != int(schema_version_required):
|
|
schema_version_update = trackme_schema_update_version(
|
|
reqinfo,
|
|
self.tenant_id,
|
|
schema_version_required,
|
|
task_name,
|
|
task_instance_id,
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, final schema version updated to {schema_version_required}.'
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to call function trackme_schema_update_version, exception="{str(e)}"'
|
|
)
|
|
|
|
#
|
|
# check if the vtenant is the last enabled vtenant to be upgraded, if so we will execute the general health tracker
|
|
#
|
|
|
|
vtenants_records = collection.data.query()
|
|
vtenants_remaining_count = 0
|
|
# iterate through vtenant records, count remaining vtenants to be upgraded
|
|
for record in vtenants_records:
|
|
schema_version_raw = record.get("schema_version")
|
|
# If schema_version is None (e.g., tenant was created when version retrieval failed),
|
|
# treat it as needing an upgrade
|
|
if schema_version_raw is None:
|
|
schema_version_needs_upgrade = True
|
|
else:
|
|
schema_version_needs_upgrade = int(schema_version_raw) != int(schema_version_required)
|
|
if (
|
|
schema_version_needs_upgrade
|
|
and record.get("tenant_status") == "enabled"
|
|
):
|
|
vtenants_remaining_count += 1
|
|
|
|
if vtenants_remaining_count == 0:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, all vtenants are up to date, executing the general health tracker'
|
|
)
|
|
try:
|
|
reader = run_splunk_search(
|
|
self.service,
|
|
"| savedsearch trackme_general_health_manager",
|
|
{
|
|
"earliest_time": "-5m",
|
|
"latest_time": "now",
|
|
"preview": "false",
|
|
"output_mode": "json",
|
|
"count": 0,
|
|
},
|
|
24,
|
|
5,
|
|
)
|
|
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, general health tracker executed successfully'
|
|
)
|
|
|
|
except Exception as e:
|
|
msg = f'permanently failed to execute the general health tracker search, exception="{str(e)}"'
|
|
logging.error(msg)
|
|
raise Exception(msg)
|
|
|
|
else:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, schema is up to date, no action required, schema_version="{schema_version}"'
|
|
)
|
|
|
|
# end task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
|
|
)
|
|
|
|
#
|
|
#
|
|
#
|
|
|
|
#
|
|
# all components - inspect_collection
|
|
#
|
|
|
|
# context: this activty verifies that the collection record object statuses are consistent according to the Decision Maker
|
|
# It works by loading the component dta, then looping trough objects to verify and update their collection status if needed
|
|
|
|
for component in ("dsm", "dhm", "mhm", "wlk", "flx", "fqm"):
|
|
|
|
if vtenant_record.get(f"tenant_{component}_enabled") == True:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting collection records object statuses now.'
|
|
)
|
|
|
|
# set collection target
|
|
inspect_collection_name = (
|
|
f"kv_trackme_{component}_tenant_{self.tenant_id}"
|
|
)
|
|
inspect_collection = self.service.kvstore[inspect_collection_name]
|
|
|
|
#
|
|
# subtask: permanently_deleted_records_inspection
|
|
#
|
|
|
|
task_instance_id = self.get_uuid()
|
|
task_start = time.time()
|
|
task_name = "inspect_collection:permanently_deleted_records_inspection"
|
|
|
|
# start task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
#
|
|
# Check permanently deleted records:
|
|
# A permanently deleted record should not exist in the main KVstore collection, if it does, it should be purged
|
|
#
|
|
|
|
# Lists to store permanently deleted records found in anomaly
|
|
collection_permanently_deleted_records_anomaly = []
|
|
|
|
# search
|
|
search = remove_leading_spaces(
|
|
f"""\
|
|
| inputlookup trackme_{component}_tenant_{self.tenant_id} | eval keyid=_key
|
|
| lookup trackme_common_permanently_deleted_objects_tenant_{self.tenant_id} object, object_category OUTPUT _key as permanently_deleted_keys
|
|
| where isnotnull(permanently_deleted_keys)
|
|
| table keyid, *
|
|
"""
|
|
)
|
|
|
|
# kwargs
|
|
kwargs_search = {
|
|
"earliest_time": "-5m",
|
|
"latest_time": "now",
|
|
"preview": "false",
|
|
"output_mode": "json",
|
|
"count": 0,
|
|
}
|
|
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting the main data collection for permanently deleted records now.'
|
|
)
|
|
|
|
try:
|
|
reader = run_splunk_search(
|
|
self.service,
|
|
search,
|
|
kwargs_search,
|
|
24,
|
|
5,
|
|
)
|
|
|
|
for item in reader:
|
|
if isinstance(item, dict):
|
|
collection_permanently_deleted_records_anomaly.append(item)
|
|
|
|
except Exception as e:
|
|
msg = f'permanently deleted records inspection search failed with exception="{str(e)}"'
|
|
logging.error(msg)
|
|
raise Exception(msg)
|
|
|
|
if len(collection_permanently_deleted_records_anomaly) > 0:
|
|
logging.warning(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, permanently deleted records found, no_records="{len(collection_permanently_deleted_records_anomaly)}"'
|
|
)
|
|
|
|
for record in collection_permanently_deleted_records_anomaly:
|
|
try:
|
|
inspect_collection.data.delete(
|
|
json.dumps({"_key": record.get("keyid")})
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, entities in the main collection which are also in ther permanently deleted records were purged successfully, keyid="{record.get("keyid")}", record="{json.dumps(record, indent=1)}"'
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, failed to delete permanently deleted records in anmaly, keyid="{record.get("keyid")}", , record="{json.dumps(record, indent=1)}", exception="{str(e)}"'
|
|
)
|
|
|
|
else:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, no permanenlty deleted records in anomaly found'
|
|
)
|
|
|
|
#
|
|
# Check for any duplicated records in the permanently deleted records collection, based on the object field
|
|
#
|
|
|
|
permanently_deleted_records_collection_name = f"kv_trackme_common_permanently_deleted_objects_tenant_{self.tenant_id}"
|
|
permanently_deleted_records_collection = self.service.kvstore[permanently_deleted_records_collection_name]
|
|
(
|
|
permanently_deleted_records,
|
|
permanently_deleted_collection_keys,
|
|
permanently_deleted_collection_dict,
|
|
) = get_full_kv_collection(
|
|
permanently_deleted_records_collection, permanently_deleted_records_collection_name
|
|
)
|
|
|
|
# Detect duplicated records (same "(object, object_category)") and collect keys to delete (keep first seen)
|
|
duplicated_pd_keys = []
|
|
seen_pairs = set()
|
|
for pd_key, pd_record in permanently_deleted_collection_dict.items():
|
|
object_value = pd_record.get("object")
|
|
object_category = pd_record.get("object_category")
|
|
if not object_value or not object_category:
|
|
continue
|
|
pair = (object_value, object_category)
|
|
if pair in seen_pairs:
|
|
duplicated_pd_keys.append(pd_key)
|
|
else:
|
|
seen_pairs.add(pair)
|
|
|
|
if len(duplicated_pd_keys) > 0:
|
|
logging.warning(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, permanently deleted records collection has duplicates, duplicates_count="{len(duplicated_pd_keys)}"'
|
|
)
|
|
for pd_key in duplicated_pd_keys:
|
|
try:
|
|
permanently_deleted_records_collection.data.delete(json.dumps({"_key": pd_key}))
|
|
# best-effort to fetch object for logging
|
|
pd_record = permanently_deleted_collection_dict.get(pd_key, {})
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, duplicate in permanently deleted records purged successfully, keyid="{pd_key}", object="{pd_record.get("object")}", object_category="{pd_record.get("object_category")}"'
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to purge duplicate in permanently deleted records, keyid="{pd_key}", exception="{str(e)}"'
|
|
)
|
|
else:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, no duplicates found in permanently deleted records collection'
|
|
)
|
|
|
|
# end subtask
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
|
|
)
|
|
|
|
#
|
|
# subtask: corrupted_records_inspection
|
|
#
|
|
|
|
task_start = time.time()
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "inspect_collection:corrupted_records_inspection"
|
|
|
|
# start task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
#
|
|
# Check for unexpected corrupted record, a foreign record which has been stored in the KVstore by mistake
|
|
# would not have an object value, and would be purged if any.
|
|
#
|
|
|
|
# Lists to store corrupted records
|
|
collection_corrupted_records = []
|
|
|
|
# search
|
|
search = remove_leading_spaces(
|
|
f"""\
|
|
| inputlookup trackme_{component}_tenant_{self.tenant_id} | eval keyid=_key
|
|
| where isnull(object) OR object=""
|
|
| table keyid, *
|
|
"""
|
|
)
|
|
|
|
# kwargs
|
|
kwargs_search = {
|
|
"earliest_time": "-5m",
|
|
"latest_time": "now",
|
|
"preview": "false",
|
|
"output_mode": "json",
|
|
"count": 0,
|
|
}
|
|
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting the main data collection for corrupted records now.'
|
|
)
|
|
|
|
try:
|
|
reader = run_splunk_search(
|
|
self.service,
|
|
search,
|
|
kwargs_search,
|
|
24,
|
|
5,
|
|
)
|
|
|
|
for item in reader:
|
|
if isinstance(item, dict):
|
|
collection_corrupted_records.append(item)
|
|
|
|
except Exception as e:
|
|
msg = f'corrupted record inspection search failed with exception="{str(e)}"'
|
|
logging.error(msg)
|
|
raise Exception(msg)
|
|
|
|
if len(collection_corrupted_records) > 0:
|
|
logging.warning(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, corrupted records found, no_records="{len(collection_corrupted_records)}"'
|
|
)
|
|
|
|
for corrupted_record in collection_corrupted_records:
|
|
try:
|
|
inspect_collection.data.delete(
|
|
json.dumps({"_key": corrupted_record.get("keyid")})
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, corrupted record deleted successfully, keyid="{corrupted_record.get("keyid")}", record="{json.dumps(corrupted_record, indent=1)}"'
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, failed to delete corrupted record, keyid="{corrupted_record.get("keyid")}", , record="{json.dumps(corrupted_record, indent=1)}", exception="{str(e)}"'
|
|
)
|
|
|
|
else:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, no corrupted records found'
|
|
)
|
|
|
|
# end subtask
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
|
|
)
|
|
|
|
#
|
|
# subtask: missing_tenant_id_records_inspection
|
|
#
|
|
|
|
task_start = time.time()
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "inspect_collection:missing_tenant_id_records_inspection"
|
|
|
|
# start task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
#
|
|
# Check for records which would miss the tenant_id field, and add it if needed
|
|
#
|
|
|
|
# collection_misggin_tenant_id_records records
|
|
collection_missing_tenant_id_records = []
|
|
|
|
# search
|
|
search = remove_leading_spaces(
|
|
f"""\
|
|
| inputlookup trackme_{component}_tenant_{self.tenant_id} | eval keyid=_key
|
|
| where isnull(tenant_id) OR tenant_id=""
|
|
| table keyid, *
|
|
"""
|
|
)
|
|
|
|
# kwargs
|
|
kwargs_search = {
|
|
"earliest_time": "-5m",
|
|
"latest_time": "now",
|
|
"preview": "false",
|
|
"output_mode": "json",
|
|
"count": 0,
|
|
}
|
|
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting the main data collection for corrupted records now.'
|
|
)
|
|
|
|
try:
|
|
reader = run_splunk_search(
|
|
self.service,
|
|
search,
|
|
kwargs_search,
|
|
24,
|
|
5,
|
|
)
|
|
|
|
for item in reader:
|
|
if isinstance(item, dict):
|
|
collection_missing_tenant_id_records.append(item)
|
|
|
|
except Exception as e:
|
|
msg = f'missing tenant_id record inspection search failed with exception="{str(e)}"'
|
|
logging.error(msg)
|
|
raise Exception(msg)
|
|
|
|
if len(collection_missing_tenant_id_records) > 0:
|
|
logging.warning(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, records found, no_records="{len(collection_missing_tenant_id_records)}"'
|
|
)
|
|
|
|
for missing_record in collection_missing_tenant_id_records:
|
|
try:
|
|
missing_record["tenant_id"] = self.tenant_id
|
|
inspect_collection.data.update(
|
|
missing_record.get("_key"),
|
|
json.dumps(missing_record),
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, record updated successfully, keyid="{missing_record.get("keyid")}", record="{json.dumps(missing_record, indent=1)}"'
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, failed to update record, keyid="{missing_record.get("keyid")}", , record="{json.dumps(missing_record, indent=1)}", exception="{str(e)}"'
|
|
)
|
|
|
|
else:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, no records found'
|
|
)
|
|
|
|
# end subtask
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
|
|
)
|
|
|
|
#
|
|
# subtask: entities_auto_disablement
|
|
#
|
|
|
|
task_start = time.time()
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "inspect_collection:entities_auto_disablement"
|
|
|
|
# start task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
#
|
|
# Check for feeds entities to be disabled according to the system wide setting: splk_general_feeds_auto_disablement_period
|
|
# This setting allows to disable feeds entities if they have not been updated for a certain period of time
|
|
#
|
|
|
|
# system wide setting
|
|
try:
|
|
splk_general_feeds_auto_disablement_period = reqinfo["trackme_conf"][
|
|
"splk_general"
|
|
]["splk_general_feeds_auto_disablement_period"]
|
|
except Exception as e:
|
|
logging.warning(f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to get system wide setting, splk_general_feeds_auto_disablement_period, using default value, exception="{str(e)}"')
|
|
splk_general_feeds_auto_disablement_period = "90d"
|
|
|
|
# tenant setting (override system wide setting, if set)
|
|
try:
|
|
splk_feeds_auto_disablement_period = vtenant_account.get(
|
|
"splk_feeds_auto_disablement_period"
|
|
)
|
|
except Exception as e:
|
|
logging.warning(f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to get tenant setting, splk_feeds_auto_disablement_period, using system wide setting, exception="{str(e)}"')
|
|
splk_feeds_auto_disablement_period = splk_general_feeds_auto_disablement_period
|
|
|
|
# handle
|
|
auto_disablement_period = (
|
|
splk_feeds_auto_disablement_period
|
|
if splk_feeds_auto_disablement_period
|
|
else splk_general_feeds_auto_disablement_period
|
|
)
|
|
|
|
if auto_disablement_period != "0d" and component in (
|
|
"dsm",
|
|
"dhm",
|
|
"mhm",
|
|
):
|
|
|
|
# Lists to store entities to be disabled
|
|
entities_to_be_disabled = []
|
|
|
|
# search
|
|
search = remove_leading_spaces(
|
|
f"""\
|
|
| inputlookup trackme_{component}_tenant_{self.tenant_id} | eval keyid=_key
|
|
| eval last_time_seen=coalesce(data_last_time_seen, metric_last_time_seen)
|
|
| where last_time_seen<=relative_time(now(), "-{auto_disablement_period}")
|
|
| table keyid, object, last_time_seen
|
|
| eval last_time_seen_human=strftime(last_time_seen, "%c")
|
|
"""
|
|
)
|
|
|
|
# kwargs
|
|
kwargs_search = {
|
|
"earliest_time": "-5m",
|
|
"latest_time": "now",
|
|
"preview": "false",
|
|
"output_mode": "json",
|
|
"count": 0,
|
|
}
|
|
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting the main data collection for entities to be disabled according to auto-disablement setting. (auto_disablement_period="{auto_disablement_period}")'
|
|
)
|
|
|
|
try:
|
|
reader = run_splunk_search(
|
|
self.service,
|
|
search,
|
|
kwargs_search,
|
|
24,
|
|
5,
|
|
)
|
|
|
|
for item in reader:
|
|
if isinstance(item, dict):
|
|
entities_to_be_disabled.append(item.get("keyid"))
|
|
|
|
except Exception as e:
|
|
msg = f'auto-disablement record inspection search failed with exception="{str(e)}"'
|
|
logging.error(msg)
|
|
raise Exception(msg)
|
|
|
|
if len(entities_to_be_disabled) > 0:
|
|
logging.warning(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, entities to be disabled were found, list="{entities_to_be_disabled}"'
|
|
)
|
|
|
|
# turn entities_to_be_disabled list into CSV
|
|
entities_to_be_disabled_csv = ",".join(entities_to_be_disabled)
|
|
|
|
# call mass disablement endpoint
|
|
if component == "dsm":
|
|
target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_dsm/write/ds_monitoring"
|
|
elif component == "dhm":
|
|
target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_dhm/write/dh_monitoring"
|
|
elif component == "mhm":
|
|
target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_mhm/write/mh_monitoring"
|
|
|
|
try:
|
|
response = session.post(
|
|
target_url,
|
|
data=json.dumps(
|
|
{
|
|
"tenant_id": self.tenant_id,
|
|
"keys_list": entities_to_be_disabled_csv,
|
|
"action": "disable",
|
|
"update_comment": f"auto-disabled by the system, last seen data is beyond the system wide auto-disablement period of {splk_general_feeds_auto_disablement_period}",
|
|
}
|
|
),
|
|
verify=False,
|
|
timeout=600,
|
|
)
|
|
|
|
if response.status_code not in (200, 201, 204):
|
|
msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, query has failed, response.status_code="{response.status_code}", response.text="{response.text}"'
|
|
logging.error(msg)
|
|
else:
|
|
try:
|
|
success_count = response.json().get("success_count")
|
|
except Exception as e:
|
|
success_count = 0
|
|
msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, request was successful, success_count="{success_count}"'
|
|
logging.info(msg)
|
|
|
|
except Exception as e:
|
|
msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, ctask="{task_name}", task_instance_id={task_instance_id}, request failed with exception="{str(e)}"'
|
|
logging.info(msg)
|
|
|
|
else:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, no entities to be disabled were found'
|
|
)
|
|
|
|
# end subtask
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
|
|
)
|
|
|
|
#
|
|
# subtask: handle_sync_entities
|
|
#
|
|
|
|
task_start = time.time()
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "inspect_collection:handle_sync_entities"
|
|
|
|
# start task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
#
|
|
# Inspecting statuses
|
|
#
|
|
|
|
#
|
|
# START raw collections records: Get raw collection records using a Splunk search
|
|
#
|
|
|
|
# search
|
|
search = remove_leading_spaces(
|
|
f"""\
|
|
| trackmegetcoll tenant_id="{self.tenant_id}" component="{component}" | fields - _raw | table *
|
|
| lookup trackme_{component}_tenant_{self.tenant_id} _key as keyid OUTPUT object_state as kvcoll_object_state, anomaly_reason as kvcoll_anomaly_reason, latest_flip_time as kvcoll_latest_flip_time
|
|
| where object_state!=kvcoll_object_state
|
|
"""
|
|
)
|
|
|
|
# kwargs
|
|
kwargs_search = {
|
|
"earliest_time": "-5m",
|
|
"latest_time": "now",
|
|
"preview": "false",
|
|
"output_mode": "json",
|
|
"count": 0,
|
|
}
|
|
|
|
delta_records = []
|
|
delta_records_keys = set()
|
|
delta_records_objects = set()
|
|
delta_records_dict = {}
|
|
|
|
try:
|
|
reader = run_splunk_search(
|
|
self.service,
|
|
search,
|
|
kwargs_search,
|
|
24,
|
|
5,
|
|
)
|
|
|
|
for item in reader:
|
|
if isinstance(item, dict):
|
|
delta_records.append(item)
|
|
delta_records_keys.add(item.get("keyid"))
|
|
delta_records_objects.add(item.get("object"))
|
|
delta_records_dict[item.get("keyid")] = item
|
|
|
|
except Exception as e:
|
|
msg = f'main search failed with exception="{str(e)}"'
|
|
logging.error(msg)
|
|
raise Exception(msg)
|
|
|
|
#
|
|
# END raw collections records: Get raw collection records using a Splunk search
|
|
#
|
|
|
|
#
|
|
# Handle delta records
|
|
#
|
|
|
|
inspectcollection_compare_records_start_time = time.time()
|
|
|
|
for item in delta_records:
|
|
|
|
item_key = item.get("keyid")
|
|
item_object = decode_unicode(item.get("object"))
|
|
item_alias = item.get("alias")
|
|
item_object_state = item.get("object_state")
|
|
item_object_category = item.get("object_category")
|
|
item_anomaly_reason = item.get("anomaly_reason")
|
|
item_monitored_state = item.get("monitored_state")
|
|
item_priority = item.get("priority")
|
|
|
|
# our delta state
|
|
collection_object_state = item.get("kvcoll_object_state")
|
|
|
|
# previous_anomaly_reason
|
|
collection_anomaly_reason = item.get(
|
|
"kvcoll_anomaly_reason", "unknown"
|
|
)
|
|
|
|
# previous flip time
|
|
try:
|
|
collection_latest_flip_time = float(
|
|
item.get("kvcoll_latest_flip_time", 0)
|
|
)
|
|
except Exception as e:
|
|
collection_latest_flip_time = 0
|
|
|
|
# disruption time
|
|
disruption_time = 0
|
|
|
|
# compare the object state with item_object_state using decisionmaker_collection_records_dict using the key
|
|
# if the object_state value is different, log the issue
|
|
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, collection record object state is not consistent, object="{item_object}", object_id="{item_key}", in_collection_object_state="{collection_object_state}", in_result_object_state="{item_object_state}", in_collection_anomaly_reason="{collection_anomaly_reason}"'
|
|
)
|
|
|
|
# get the current kvrecord
|
|
kvrecord_updated = False
|
|
|
|
try:
|
|
kvrecord = inspect_collection.data.query(
|
|
query=json.dumps({"_key": item_key})
|
|
)[0]
|
|
|
|
# update the kvrecord object_state, status_message and anomaly_reason
|
|
kvrecord["object_state"] = item_object_state
|
|
kvrecord["status_message"] = item.get("status_message")
|
|
kvrecord["anomaly_reason"] = item_anomaly_reason
|
|
kvrecord["mtime"] = time.time()
|
|
kvrecord["latest_flip_time"] = time.time()
|
|
kvrecord["latest_flip_state"] = item_object_state
|
|
|
|
# process the KVstore record update
|
|
inspect_collection.data.update(item_key, json.dumps(kvrecord))
|
|
|
|
kvrecord_updated = True
|
|
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, collection record object update successfully, object="{item_object}", object_id="{item_key}"'
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, failed to update the KVstore record, object="{item_object}", collection_name="{collection_name}", exception="{str(e)}"'
|
|
)
|
|
|
|
# proceeed with next steps
|
|
if kvrecord_updated:
|
|
try:
|
|
|
|
# calculate disruption time if current_state is green and previous_state was red
|
|
if (
|
|
item_object_state == "green"
|
|
and collection_object_state == "red"
|
|
):
|
|
try:
|
|
disruption_time = round(
|
|
(time.time() - collection_latest_flip_time),
|
|
2,
|
|
)
|
|
except Exception as e:
|
|
disruption_time = 0
|
|
|
|
flip_timestamp = time.strftime(
|
|
"%d/%m/%Y %H:%M:%S",
|
|
time.localtime(time.time()),
|
|
)
|
|
disruption_time_str = f', disruption_time="{disruption_time}"' if disruption_time and disruption_time > 0 else ""
|
|
flip_result = f'{flip_timestamp}, object="{item_object}" has flipped from previous_state="{collection_object_state}" to state="{item_object_state}" with anomaly_reason="{item_anomaly_reason}", previous_anomaly_reason="{collection_anomaly_reason}"{disruption_time_str}'
|
|
|
|
flip_record = {
|
|
"timeStr": flip_timestamp,
|
|
"tenant_id": self.tenant_id,
|
|
"alias": item_alias,
|
|
"keyid": item_key,
|
|
"object": item_object,
|
|
"object_category": item_object_category,
|
|
"object_state": item_object_state,
|
|
"object_previous_state": collection_object_state,
|
|
"priority": item_priority,
|
|
"latest_flip_time": time.time(),
|
|
"latest_flip_state": item_object_state,
|
|
"anomaly_reason": item_anomaly_reason,
|
|
"result": flip_result,
|
|
}
|
|
|
|
# add event_id
|
|
flip_record["event_id"] = hashlib.sha256(
|
|
json.dumps(flip_record).encode()
|
|
).hexdigest()
|
|
|
|
trackme_gen_state(
|
|
index=tenant_indexes["trackme_summary_idx"],
|
|
sourcetype="trackme:flip",
|
|
source="flip_state_change_tracking",
|
|
event=flip_record,
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, TrackMe flipping event created successfully, record="{json.dumps(flip_record, indent=1)}"'
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, object="{item_object}", task="{task_name}", task_instance_id={task_instance_id}, record="{json.dumps(flip_record, indent=1)}", failed to generate a flipping state event with exception="{e}"'
|
|
)
|
|
|
|
#
|
|
# SLA metrics
|
|
#
|
|
|
|
# create a list for SLA metrics generation
|
|
sla_metrics_records = []
|
|
|
|
if item_object_state == "green":
|
|
object_num_state = 1
|
|
elif item_object_state == "red":
|
|
object_num_state = 2
|
|
elif item_object_state == "orange":
|
|
object_num_state = 3
|
|
elif item_object_state == "blue":
|
|
object_num_state = 4
|
|
else:
|
|
object_num_state = 5
|
|
|
|
# add to our list
|
|
sla_metrics_records.append(
|
|
{
|
|
"tenant_id": self.tenant_id,
|
|
"object_id": item_key,
|
|
"object": item_object,
|
|
"alias": item_alias,
|
|
"object_category": item_object_category,
|
|
"monitored_state": item_monitored_state,
|
|
"priority": item_priority,
|
|
"metrics_event": {"object_state": object_num_state},
|
|
}
|
|
)
|
|
|
|
# call the SLA gen metrics function
|
|
sla_metrics_gen_start = time.time()
|
|
try:
|
|
sla_metrics = trackme_sla_gen_metrics(
|
|
self.tenant_id,
|
|
tenant_indexes.get("trackme_metric_idx"),
|
|
sla_metrics_records,
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, function trackme_sla_gen_metrics success {sla_metrics}, run_time={round(time.time()-sla_metrics_gen_start, 3)}, no_entities={len(sla_metrics_records)}'
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, function trackme_sla_gen_metrics failed with exception {str(e)}'
|
|
)
|
|
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, no_delta_records="{len(delta_records_keys)}", run_time="{round((time.time() - inspectcollection_compare_records_start_time), 3)}", collection="{inspect_collection_name}"'
|
|
)
|
|
|
|
#
|
|
# END comparison
|
|
#
|
|
|
|
# end subtask
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 3)}", task has terminated.'
|
|
)
|
|
|
|
#
|
|
#
|
|
#
|
|
|
|
#
|
|
# Call the trackme_register_tenant_component_summary
|
|
#
|
|
|
|
# Use threading to do an async call to the register summary without waiting for it to complete
|
|
thread = threading.Thread(
|
|
target=self.register_component_summary_async,
|
|
args=(
|
|
session_key,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self.tenant_id,
|
|
component,
|
|
),
|
|
)
|
|
thread.start()
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, trackme_register_tenant_component_summary was requested.'
|
|
)
|
|
|
|
#
|
|
# task: untracked_entities
|
|
#
|
|
|
|
#
|
|
# splk-dsm - untracked entities
|
|
#
|
|
|
|
# context: this activity tracks and maintain state for untracked entities
|
|
# untracked entities are entities which are entirely out of the scope of any trackers, and therefore not maintained otherwise
|
|
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "untracked_entities"
|
|
task_start = time.time()
|
|
|
|
# start task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
if vtenant_record.get("tenant_dsm_enabled") == True:
|
|
component = "dsm"
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting untracked entities now.'
|
|
)
|
|
|
|
# kwargs
|
|
kwargs_oneshot = {
|
|
"earliest_time": "-5m",
|
|
"latest_time": "now",
|
|
"output_mode": "json",
|
|
"count": 0,
|
|
}
|
|
|
|
untracked_entities_count = 0
|
|
untracked_entities_processed_objects = []
|
|
|
|
untracked_entities_search = f"""\
|
|
| inputlookup trackme_{component}_tenant_{self.tenant_id} | eval key=_key
|
|
|
|
``` target any entity that has not been updated since more than 15m ```
|
|
| eval time_sec_since_inspection=now()-tracker_runtime
|
|
| where ( time_sec_since_inspection>900 OR isnull(tracker_runtime) )
|
|
|
|
``` called the offline abstract macro version ```
|
|
`trackme_{component}_tracker_abstract({self.tenant_id})`
|
|
|
|
``` collects latest collection state into the summary index ```
|
|
| `trackme_collect_state("current_state_tracking:splk-{component}:{self.tenant_id}", "object", "{self.tenant_id}")`
|
|
|
|
``` output flipping change status if changes ```
|
|
| trackmesplkgetflipping tenant_id="{self.tenant_id}" object_category="splk-{component}"
|
|
|
|
``` update the KVstore collection ```
|
|
| `trackme_outputlookup_tracker_health(trackme_{component}_tenant_{self.tenant_id}, key)`
|
|
|
|
``` update the delay metric only ```
|
|
| `trackme_mcollect(object, splk-{component}, "metric_name:trackme.splk.feeds.lag_event_sec=data_last_lag_seen", "tenant_id, object_category, object", "{self.tenant_id}")`
|
|
|
|
``` summarize job ```
|
|
| stats count as report_entities_count, values(object) as objects by tenant_id
|
|
"""
|
|
|
|
# run the main report, every result is a Splunk search to be executed on its own thread
|
|
try:
|
|
reader = run_splunk_search(
|
|
self.service,
|
|
untracked_entities_search,
|
|
kwargs_oneshot,
|
|
24,
|
|
5,
|
|
)
|
|
|
|
for item in reader:
|
|
if isinstance(item, dict):
|
|
untracked_entities_count += 1
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, entities_count="{len(item)}"'
|
|
)
|
|
untracked_entities_processed_objects = item.get("objects", [])
|
|
|
|
if untracked_entities_count == 0:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, there are no untracked entities currently.'
|
|
)
|
|
|
|
except Exception as e:
|
|
# Call the component register
|
|
trackme_register_tenant_object_summary(
|
|
session_key,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self.tenant_id,
|
|
"all",
|
|
report_name,
|
|
"failure",
|
|
time.time(),
|
|
str(time.time() - start),
|
|
str(e),
|
|
"-5m",
|
|
"now",
|
|
)
|
|
msg = f'task="{task_name}", task_instance_id={task_instance_id}, tenant_id="{self.tenant_id}", main search failed with exception="{str(e)}"'
|
|
logging.error(msg)
|
|
raise Exception(msg)
|
|
|
|
if untracked_entities_processed_objects:
|
|
handler_events_records = []
|
|
for object_name in untracked_entities_processed_objects:
|
|
handler_events_records.append(
|
|
{
|
|
"object": object_name,
|
|
"object_category": f"splk-{component}",
|
|
"object_id": hashlib.sha256(
|
|
object_name.encode("utf-8")
|
|
).hexdigest(),
|
|
"handler": "health_tracker:untracked_entities",
|
|
"handler_message": "Entity was inspected by the heath tracker, it is out of the scope of any hybrid tracker due to high delay and/or latency.",
|
|
"handler_troubleshoot_search": f"index=_internal sourcetype=trackme:custom_commands:trackmetrackerhealth tenant_id={self.tenant_id} component=splk-{component} task=untracked_entities",
|
|
"handler_time": time.time(),
|
|
}
|
|
)
|
|
|
|
# notification event
|
|
try:
|
|
trackme_handler_events(
|
|
session_key=self._metadata.searchinfo.session_key,
|
|
splunkd_uri=self._metadata.searchinfo.splunkd_uri,
|
|
tenant_id=self.tenant_id,
|
|
sourcetype="trackme:handler",
|
|
source=f"trackme:handler:{self.tenant_id}",
|
|
handler_events=handler_events_records,
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{component}", could not send notification event, exception="{e}"'
|
|
)
|
|
|
|
#
|
|
# splk-dhm - untracked entities
|
|
#
|
|
# context: this activity tracks and maintain state for untracked entities
|
|
# untracked entities are entities which are entirely out of the scope of any trackers, and therefore not maintained otherwise
|
|
|
|
if vtenant_record.get("tenant_dhm_enabled") == True:
|
|
component = "dhm"
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting untracked entities now.'
|
|
)
|
|
|
|
# kwargs
|
|
kwargs_oneshot = {
|
|
"earliest_time": "-5m",
|
|
"latest_time": "now",
|
|
"output_mode": "json",
|
|
"count": 0,
|
|
}
|
|
|
|
untracked_entities_count = 0
|
|
untracked_entities_processed_objects = []
|
|
|
|
untracked_entities_search = f"""\
|
|
| inputlookup trackme_{component}_tenant_{self.tenant_id} | eval key=_key
|
|
|
|
``` target any entity that has not been updated since more than 15m ```
|
|
| eval time_sec_since_inspection=now()-tracker_runtime
|
|
| where ( time_sec_since_inspection>900 OR isnull(tracker_runtime) )
|
|
|
|
``` called the offline abstract macro version ```
|
|
`trackme_{component}_tracker_abstract({self.tenant_id})`
|
|
|
|
``` collects latest collection state into the summary index ```
|
|
| `trackme_collect_state("current_state_tracking:splk-{component}:{self.tenant_id}", "object", "{self.tenant_id}")`
|
|
|
|
``` output flipping change status if changes ```
|
|
| trackmesplkgetflipping tenant_id="{self.tenant_id}" object_category="splk-{component}"
|
|
|
|
``` update the KVstore collection ```
|
|
| `trackme_outputlookup_tracker_health(trackme_{component}_tenant_{self.tenant_id}, key)`
|
|
|
|
``` update the delay metric only ```
|
|
| `trackme_mcollect(object, splk-{component}, "metric_name:trackme.splk.feeds.lag_event_sec=data_last_lag_seen", "tenant_id, object_category, object", "{self.tenant_id}")`
|
|
|
|
``` summarize job ```
|
|
| stats count as report_entities_count, values(object) as objects by tenant_id
|
|
"""
|
|
|
|
# run the main report, every result is a Splunk search to be executed on its own thread
|
|
|
|
try:
|
|
reader = run_splunk_search(
|
|
self.service,
|
|
untracked_entities_search,
|
|
kwargs_oneshot,
|
|
24,
|
|
5,
|
|
)
|
|
|
|
for item in reader:
|
|
if isinstance(item, dict):
|
|
untracked_entities_count += 1
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, entities_count="{len(item)}"'
|
|
)
|
|
untracked_entities_processed_objects = item.get("objects", [])
|
|
if untracked_entities_count == 0:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, there are no untracked entities currently.'
|
|
)
|
|
|
|
except Exception as e:
|
|
# Call the component register
|
|
trackme_register_tenant_object_summary(
|
|
session_key,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self.tenant_id,
|
|
"all",
|
|
report_name,
|
|
"failure",
|
|
time.time(),
|
|
str(time.time() - start),
|
|
str(e),
|
|
"-5m",
|
|
"now",
|
|
)
|
|
msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, main search failed with exception="{str(e)}"'
|
|
logging.error(msg)
|
|
raise Exception(msg)
|
|
|
|
if untracked_entities_processed_objects:
|
|
|
|
# if untracked_entities_processed_objects is a string (a single object was reported), convert it to a list
|
|
if isinstance(untracked_entities_processed_objects, str):
|
|
untracked_entities_processed_objects = [
|
|
untracked_entities_processed_objects
|
|
]
|
|
|
|
handler_events_records = []
|
|
for object_name in untracked_entities_processed_objects:
|
|
handler_events_records.append(
|
|
{
|
|
"object": object_name,
|
|
"object_id": hashlib.sha256(
|
|
object_name.encode("utf-8")
|
|
).hexdigest(),
|
|
"object_category": f"splk-{component}",
|
|
"handler": "health_tracker:untracked_entities",
|
|
"handler_message": "Entity was inspected by the heath tracker, it is out of the scope of any hybrid tracker due to high delay and/or latency.",
|
|
"handler_troubleshoot_search": f"index=_internal sourcetype=trackme:custom_commands:trackmetrackerhealth tenant_id={self.tenant_id} component=splk-{component} task=untracked_entities",
|
|
"handler_time": time.time(),
|
|
}
|
|
)
|
|
|
|
# notification event
|
|
try:
|
|
trackme_handler_events(
|
|
session_key=self._metadata.searchinfo.session_key,
|
|
splunkd_uri=self._metadata.searchinfo.splunkd_uri,
|
|
tenant_id=self.tenant_id,
|
|
sourcetype="trackme:handler",
|
|
source=f"trackme:handler:{self.tenant_id}",
|
|
handler_events=handler_events_records,
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{component}", could not send notification event, exception="{e}"'
|
|
)
|
|
|
|
# end task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
|
|
)
|
|
|
|
#
|
|
# task: duplicated_entities
|
|
#
|
|
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "duplicated_entities"
|
|
task_start = time.time()
|
|
|
|
# start task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
# all components except splk-wlk
|
|
#
|
|
# context: this situation is not expected, but if we have duplicated entities, we need to verify and purge them
|
|
|
|
# splk-wlk - duplicated entities
|
|
#
|
|
# context: this activity tracks for duplicated entities in the Workload component
|
|
# under some rare circumstances, the Splunk scheduler logs may lack the user context, althrough we implement several safeties
|
|
# if this happens, we need to verify and purge any duplicated entity with the system user context instead of the proper user context
|
|
|
|
for component in ("dsm", "dhm", "mhm", "wlk", "flx", "fqm"):
|
|
|
|
if (
|
|
vtenant_record.get(f"tenant_{component}_enabled") == True
|
|
and vtenant_record.get("tenant_replica") == False
|
|
):
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, inspecting entities now.'
|
|
)
|
|
|
|
# kwargs
|
|
kwargs_oneshot = {
|
|
"earliest_time": "-5m",
|
|
"latest_time": "now",
|
|
"output_mode": "json",
|
|
"count": 0,
|
|
}
|
|
|
|
duplicated_entities_count = 0
|
|
duplicated_entities_list = []
|
|
|
|
# specific search for wlk
|
|
if component == "wlk":
|
|
duplicated_entities_search = remove_leading_spaces(
|
|
f"""\
|
|
| inputlookup trackme_wlk_tenant_{self.tenant_id} | eval keyid=_key
|
|
| fields keyid, account, app, user, savedsearch_name, object, last_seen
|
|
| eventstats count as dcount by account, app, savedsearch_name
|
|
| where dcount>1
|
|
| sort - 0 savedsearch_name, last_seen
|
|
"""
|
|
)
|
|
|
|
else: # other components
|
|
duplicated_entities_search = remove_leading_spaces(
|
|
f"""\
|
|
| inputlookup trackme_{component}_tenant_{self.tenant_id} | eval keyid=_key
|
|
| sort 0 object
|
|
| eventstats count as dcount by object
|
|
| streamstats count as rank by object
|
|
| where dcount>1
|
|
``` handle rank if the duplicated is due to FIPS migration ```
|
|
| eval rank=if(len(keyid) == 64, 2, 1)
|
|
| where rank=1
|
|
"""
|
|
)
|
|
|
|
# run the main report, every result is a Splunk search to be executed on its own thread
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, executing search="{duplicated_entities_search}"'
|
|
)
|
|
try:
|
|
reader = run_splunk_search(
|
|
self.service,
|
|
duplicated_entities_search,
|
|
kwargs_oneshot,
|
|
24,
|
|
5,
|
|
)
|
|
|
|
for item in reader:
|
|
if isinstance(item, dict):
|
|
duplicated_entities_count += 1
|
|
duplicated_entities_list.append(item.get("keyid"))
|
|
logging.warning(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, detected duplicated entity, keyid="{item.get("keyid")}", object="{item.get("object")}"'
|
|
)
|
|
|
|
if duplicated_entities_count == 0:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, there are no duplicated entities currently.'
|
|
)
|
|
|
|
except Exception as e:
|
|
# Call the component register
|
|
trackme_register_tenant_object_summary(
|
|
session_key,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self.tenant_id,
|
|
"all",
|
|
report_name,
|
|
"failure",
|
|
time.time(),
|
|
str(time.time() - start),
|
|
str(e),
|
|
"-5m",
|
|
"now",
|
|
)
|
|
msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, main search failed with exception="{str(e)}"'
|
|
logging.error(msg)
|
|
raise Exception(msg)
|
|
|
|
# process if needed
|
|
if duplicated_entities_count > 0:
|
|
|
|
# target
|
|
if component == "dsm":
|
|
target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_dsm/write/ds_delete"
|
|
|
|
elif component == "dhm":
|
|
target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_dhm/write/dh_delete"
|
|
|
|
if component == "mhm":
|
|
target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_mhm/write/mh_delete"
|
|
|
|
if component == "flx":
|
|
target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_flx/write/flx_delete"
|
|
|
|
if component == "fqm":
|
|
target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_fqm/write/fqm_delete"
|
|
|
|
if component == "wlk":
|
|
target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_wlk/write/wlk_delete"
|
|
|
|
# data
|
|
# turn duplicated_entities_list into a comma separated string
|
|
|
|
# update comment
|
|
if component == "wlk":
|
|
update_comment = "One or more duplicated entities were detected by the health tracker, this condition can happen when Splunk scheduler logs lack the user context, automated purge of these entities."
|
|
else:
|
|
update_comment = "One or more duplicated entities were detected by the health tracker, this condition is not expected and TrackMe needs to purge duplicates to avoid further issues."
|
|
|
|
duplicated_entities_list = ",".join(duplicated_entities_list)
|
|
post_data = {
|
|
"tenant_id": self.tenant_id,
|
|
"keys_list": duplicated_entities_list,
|
|
"deletion_type": "temporary",
|
|
"update_comment": update_comment,
|
|
}
|
|
|
|
try:
|
|
response = session.post(
|
|
target_url,
|
|
data=json.dumps(post_data),
|
|
verify=False,
|
|
timeout=600,
|
|
)
|
|
msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, duplicated entities purge successful, results="{json.dumps(response.json(), indent=2)}"'
|
|
logging.info(msg)
|
|
|
|
except Exception as e:
|
|
msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, duplicated entities purge failed with exception="{str(e)}"'
|
|
logging.info(msg)
|
|
|
|
# end task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
|
|
)
|
|
|
|
#
|
|
# task: check_trackers_collections
|
|
#
|
|
|
|
# this task is designed to verify that trackers referenced in the dedicated collections are still present in the system
|
|
# if not, it will remove the tracker from the collection
|
|
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "check_trackers_collections"
|
|
task_start = time.time()
|
|
|
|
# start task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
def check_trackers_existence(vtenant_record, component):
|
|
logging.info(f"Checking tracker definitions for component: {component}")
|
|
|
|
# Load the tracker collection associated with the component (source of truth)
|
|
tracker_collection_name = (
|
|
f"kv_trackme_{component}_hybrid_trackers_tenant_{self.tenant_id}"
|
|
)
|
|
tracker_collection = self.service.kvstore[tracker_collection_name]
|
|
|
|
# Get all the tracker records
|
|
tracker_records = tracker_collection.data.query()
|
|
|
|
for tracker_record in tracker_records:
|
|
record_knowledge_objects = json.loads(
|
|
tracker_record.get("knowledge_objects", "{}")
|
|
)
|
|
|
|
# get the reports list
|
|
reports_list = record_knowledge_objects.get("reports", [])
|
|
|
|
# identify the main tracker (tracker_main_name) which contains _tracker_tenant_ in the name
|
|
tracker_main_name = None
|
|
for report_name in reports_list:
|
|
if "_tracker_tenant_" in report_name:
|
|
tracker_main_name = report_name
|
|
break
|
|
|
|
# Verify the existence of the main tracker, if it cannot be found in the system, the entire record will be removed from the collection
|
|
purge_tracker_record = False
|
|
|
|
# the main tracker was found in the record
|
|
if tracker_main_name:
|
|
|
|
# process
|
|
savedsearch_definition = None
|
|
try:
|
|
savedsearch = self.service.saved_searches[tracker_main_name]
|
|
savedsearch_definition = savedsearch.content["search"]
|
|
savedsearch_content = savedsearch.content
|
|
except Exception as e:
|
|
savedsearch_definition = None
|
|
savedsearch_content = {}
|
|
|
|
# purge if necessary
|
|
if not savedsearch_definition:
|
|
purge_tracker_record = True
|
|
logging.warning(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, the main tracker="{tracker_main_name}" does not exist anymore, the tracker record will be removed from the collection.'
|
|
)
|
|
|
|
else: # the main tracker was not found in the record, the record is considered as invalid and will be removed from the collection
|
|
purge_tracker_record = True
|
|
logging.warning(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, the tracker record="{tracker_record}" is invalid, the tracker record will be removed from the collection.'
|
|
)
|
|
|
|
# purge if necessary
|
|
if purge_tracker_record:
|
|
|
|
try:
|
|
tracker_collection.data.delete(
|
|
json.dumps({"_key": tracker_record.get("_key")})
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, the tracker record was successfully removed from the collection.'
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, the tracker record failed to be removed from the collection, exception="{str(e)}"'
|
|
)
|
|
|
|
def recreate_missing_tracker_records(vtenant_record, component):
|
|
"""
|
|
Recreate hybrid tracker records in dedicated KVstore if they exist in
|
|
tenant_<component>_hybrid_objects but are missing from the dedicated collection.
|
|
"""
|
|
logging.info(f"Checking for missing tracker records to recreate for component: {component}")
|
|
|
|
# Load the tenant hybrid objects from vtenant_record (central source)
|
|
hybrid_objects_json = vtenant_record.get(f"tenant_{component}_hybrid_objects")
|
|
|
|
if not hybrid_objects_json:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, No hybrid objects found in vtenant_record for component "{component}", skipping recreation check.'
|
|
)
|
|
return
|
|
|
|
try:
|
|
hybrid_objects = json.loads(hybrid_objects_json)
|
|
except Exception as e:
|
|
logging.warning(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Failed to parse hybrid_objects JSON, exception="{str(e)}"'
|
|
)
|
|
return
|
|
|
|
reports_list = hybrid_objects.get("reports", [])
|
|
macros_list = hybrid_objects.get("macros", [])
|
|
|
|
if not reports_list:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, No reports found in hybrid_objects for component "{component}", skipping recreation check.'
|
|
)
|
|
return
|
|
|
|
# Load the dedicated tracker collection
|
|
tracker_collection_name = (
|
|
f"kv_trackme_{component}_hybrid_trackers_tenant_{self.tenant_id}"
|
|
)
|
|
tracker_collection = self.service.kvstore[tracker_collection_name]
|
|
|
|
# Get existing tracker records from dedicated collection
|
|
existing_tracker_records = tracker_collection.data.query()
|
|
existing_tracker_names = set()
|
|
|
|
for record in existing_tracker_records:
|
|
tracker_name = record.get("tracker_name")
|
|
if tracker_name:
|
|
existing_tracker_names.add(tracker_name)
|
|
|
|
# Process wrapper reports to extract tracker names
|
|
# Pattern: trackme_<component>_hybrid_<tracker_name>_wrapper_tenant_<tenant_id>
|
|
wrapper_prefix = f"trackme_{component}_hybrid_"
|
|
wrapper_suffix = f"_wrapper_tenant_{self.tenant_id}"
|
|
|
|
# Track trackers we've already processed to avoid duplicates
|
|
processed_trackers = {}
|
|
|
|
for report_name in reports_list:
|
|
# Only process wrapper reports to identify trackers
|
|
if "_wrapper_" not in report_name:
|
|
continue
|
|
|
|
# Extract tracker_name from wrapper report name
|
|
# Pattern: trackme_<component>_hybrid_<tracker_name>_wrapper_tenant_<tenant_id>
|
|
if report_name.startswith(wrapper_prefix) and report_name.endswith(wrapper_suffix):
|
|
# Remove prefix and suffix to get tracker_name
|
|
tracker_name = report_name[len(wrapper_prefix):-len(wrapper_suffix)]
|
|
|
|
# Check if this tracker exists in the dedicated collection
|
|
if tracker_name not in existing_tracker_names and tracker_name not in processed_trackers:
|
|
# Collect all reports and macros for this tracker
|
|
tracker_reports = []
|
|
tracker_macros = []
|
|
|
|
# Find all reports that belong to this tracker
|
|
# Use explicit expected report name construction for precise matching
|
|
# This avoids issues with reserved words (abstract, wrapper, tracker) and substring matches
|
|
# Reports patterns vary by component:
|
|
# - Components with abstract (dsm, dhm, mhm):
|
|
# * trackme_<component>_hybrid_abstract_<tracker_name>_tenant_<tenant_id>
|
|
# * trackme_<component>_hybrid_<tracker_name>_wrapper_tenant_<tenant_id>
|
|
# * trackme_<component>_hybrid_<tracker_name>_tracker_tenant_<tenant_id>
|
|
# - Components without abstract (flx, wlk, fqm):
|
|
# * trackme_<component>_hybrid_<tracker_name>_wrapper_tenant_<tenant_id>
|
|
# * trackme_<component>_hybrid_<tracker_name>_tracker_tenant_<tenant_id>
|
|
|
|
# Construct expected report names explicitly for exact matching
|
|
expected_reports = []
|
|
# Components with abstract reports: dsm, dhm, mhm
|
|
if component in ["dsm", "dhm", "mhm"]:
|
|
expected_reports.append(f"trackme_{component}_hybrid_abstract_{tracker_name}_tenant_{self.tenant_id}")
|
|
# All components have wrapper and tracker reports
|
|
expected_reports.append(f"trackme_{component}_hybrid_{tracker_name}_wrapper_tenant_{self.tenant_id}")
|
|
expected_reports.append(f"trackme_{component}_hybrid_{tracker_name}_tracker_tenant_{self.tenant_id}")
|
|
|
|
# Match reports using exact names
|
|
for report in reports_list:
|
|
if report in expected_reports:
|
|
tracker_reports.append(report)
|
|
|
|
# Find all macros that belong to this tracker
|
|
# Note: Macros are only applicable to dsm, dhm, mhm components
|
|
# Macro pattern: trackme_<component>_hybrid_root_constraint_<tracker_name>_tenant_<tenant_id>
|
|
# Use exact expected macro name for matching (similar to reports above)
|
|
if component in ["dsm", "dhm", "mhm"]:
|
|
expected_macro = f"trackme_{component}_hybrid_root_constraint_{tracker_name}_tenant_{self.tenant_id}"
|
|
if expected_macro in macros_list:
|
|
tracker_macros.append(expected_macro)
|
|
|
|
# Only proceed if we have at least one report
|
|
if tracker_reports:
|
|
processed_trackers[tracker_name] = {
|
|
"reports": tracker_reports,
|
|
"macros": tracker_macros
|
|
}
|
|
|
|
# Recreate missing tracker records
|
|
for tracker_name, knowledge_data in processed_trackers.items():
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Recreating missing tracker record for "{tracker_name}" in dedicated collection.'
|
|
)
|
|
|
|
# Build knowledge_objects structure (without properties as per requirement)
|
|
knowledge_objects = {
|
|
"reports": knowledge_data["reports"]
|
|
}
|
|
|
|
# Add macros if present (only for components that use them)
|
|
if knowledge_data["macros"]:
|
|
knowledge_objects["macros"] = knowledge_data["macros"]
|
|
|
|
# Create the tracker record
|
|
new_tracker_record = {
|
|
"_key": hashlib.sha256(tracker_name.encode("utf-8")).hexdigest(),
|
|
"tracker_name": tracker_name,
|
|
"knowledge_objects": json.dumps(knowledge_objects, indent=2),
|
|
"created_time": time.time(),
|
|
"created_by": "health_tracker"
|
|
}
|
|
|
|
# Add component-specific fields
|
|
if component == "wlk":
|
|
# wlk tracker records require tracker_type field
|
|
# tracker_name format is: {tracker_type}_{uuid}
|
|
# Extract tracker_type from tracker_name
|
|
# Note: Some tracker types contain underscores (e.g., inactive_entities, splunkcloud_svc)
|
|
# so we need to check for multi-word types first before falling back to simple split
|
|
valid_wlk_tracker_types = [
|
|
"main", "introspection", "scheduler", "metadata",
|
|
"orphan", "inactive_entities", "splunkcloud_svc", "notable"
|
|
]
|
|
extracted_tracker_type = None
|
|
# First, try to match known multi-word tracker types
|
|
for valid_type in valid_wlk_tracker_types:
|
|
if tracker_name.startswith(valid_type + "_") or tracker_name == valid_type:
|
|
extracted_tracker_type = valid_type
|
|
break
|
|
# If no match found, fall back to simple split for single-word types
|
|
if not extracted_tracker_type and "_" in tracker_name:
|
|
first_segment = tracker_name.split("_", 1)[0]
|
|
if first_segment in valid_wlk_tracker_types:
|
|
extracted_tracker_type = first_segment
|
|
|
|
if extracted_tracker_type:
|
|
new_tracker_record["tracker_type"] = extracted_tracker_type
|
|
else:
|
|
logging.warning(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Could not extract valid tracker_type from tracker_name="{tracker_name}" (expected format: tracker_type_uuid)'
|
|
)
|
|
elif component in ["flx", "fqm"]:
|
|
# flx and fqm use tracker_id field
|
|
new_tracker_record["tracker_id"] = tracker_name
|
|
|
|
try:
|
|
# Final safety check: verify the tracker doesn't exist before insertion
|
|
final_check = tracker_collection.data.query(
|
|
query=json.dumps({"tracker_name": tracker_name})
|
|
)
|
|
if not final_check:
|
|
tracker_collection.data.insert(json.dumps(new_tracker_record))
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Successfully recreated tracker record for "{tracker_name}" in dedicated collection.'
|
|
)
|
|
else:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Tracker "{tracker_name}" already exists in dedicated collection, skipping recreation.'
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Failed to recreate tracker record for "{tracker_name}", exception: {str(e)}'
|
|
)
|
|
|
|
# Main logic
|
|
components = ["dsm", "dhm", "mhm", "flx", "wlk", "fqm"]
|
|
for component in components:
|
|
if vtenant_record.get(f"tenant_{component}_enabled"):
|
|
check_trackers_existence(vtenant_record, component)
|
|
recreate_missing_tracker_records(vtenant_record, component)
|
|
|
|
# end task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
|
|
)
|
|
|
|
#
|
|
# task: check_trackers
|
|
#
|
|
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "check_trackers_definition"
|
|
task_start = time.time()
|
|
|
|
# start task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
def check_trackers_definition(vtenant_record, component):
|
|
logging.info(f"Checking tracker definitions for component: {component}")
|
|
|
|
# Load the tracker collection associated with the component (source of truth)
|
|
tracker_collection_name = (
|
|
f"kv_trackme_{component}_hybrid_trackers_tenant_{self.tenant_id}"
|
|
)
|
|
tracker_collection = self.service.kvstore[tracker_collection_name]
|
|
|
|
# Get all the tracker records
|
|
tracker_records = tracker_collection.data.query()
|
|
|
|
# Initialize empty sets for the reports and macros that should be in the vtenant_record
|
|
truth_reports = set()
|
|
truth_macros = set()
|
|
|
|
for tracker_record in tracker_records:
|
|
record_knowledge_objects = json.loads(
|
|
tracker_record.get("knowledge_objects", "{}")
|
|
)
|
|
|
|
# Collect the reports and macros from the tracker record's knowledge_objects
|
|
truth_reports.update(record_knowledge_objects.get("reports", []))
|
|
truth_macros.update(record_knowledge_objects.get("macros", []))
|
|
|
|
# Load the current tenant hybrid objects from vtenant_record (destination)
|
|
hybrid_objects_json = vtenant_record.get(
|
|
f"tenant_{component}_hybrid_objects"
|
|
)
|
|
|
|
if hybrid_objects_json:
|
|
# Load the JSON object from the hybrid_objects field
|
|
hybrid_objects = json.loads(hybrid_objects_json)
|
|
else:
|
|
# If no existing hybrid_objects, initialize an empty structure
|
|
hybrid_objects = {"reports": [], "macros": []}
|
|
|
|
vtenant_reports = set(hybrid_objects.get("reports", []))
|
|
vtenant_macros = set(hybrid_objects.get("macros", []))
|
|
|
|
# Compare and find missing reports/macros in the vtenant_record
|
|
missing_reports = truth_reports - vtenant_reports
|
|
missing_macros = truth_macros - vtenant_macros
|
|
|
|
# If there are any missing reports or macros, add them to the vtenant_record
|
|
if missing_reports or missing_macros:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Missing reports: {missing_reports} or macros: {missing_macros} in vtenant_record.'
|
|
)
|
|
|
|
# Update the vtenant_record with missing reports and macros
|
|
hybrid_objects["reports"] = list(vtenant_reports.union(truth_reports))
|
|
hybrid_objects["macros"] = list(vtenant_macros.union(truth_macros))
|
|
|
|
# Save the updated hybrid objects back to the vtenant_record
|
|
vtenant_record[f"tenant_{component}_hybrid_objects"] = json.dumps(
|
|
hybrid_objects, indent=2
|
|
)
|
|
|
|
try:
|
|
self.service.kvstore["kv_trackme_virtual_tenants"].data.update(
|
|
str(vtenant_record["_key"]), json.dumps(vtenant_record)
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, vtenant_record updated successfully.'
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Failed to update vtenant_record, exception: {str(e)}'
|
|
)
|
|
|
|
def check_trackers_existence_in_dedicated_kvstore(vtenant_record, component):
|
|
logging.info(f"Checking tracker existence in dedicated KVstore for component: {component}")
|
|
|
|
# Load the central KVstore collection to get all tracker records
|
|
central_collection_name = f"kv_trackme_{component}_tenant_{self.tenant_id}"
|
|
try:
|
|
central_collection = self.service.kvstore[central_collection_name]
|
|
central_records = central_collection.data.query()
|
|
except Exception as e:
|
|
logging.warning(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Central collection "{central_collection_name}" not found or accessible, exception: {str(e)}'
|
|
)
|
|
return
|
|
|
|
# Load the dedicated tracker collection
|
|
tracker_collection_name = (
|
|
f"kv_trackme_{component}_hybrid_trackers_tenant_{self.tenant_id}"
|
|
)
|
|
tracker_collection = self.service.kvstore[tracker_collection_name]
|
|
|
|
# Get existing tracker records from dedicated collection
|
|
existing_tracker_records = tracker_collection.data.query()
|
|
existing_tracker_names = set()
|
|
|
|
for record in existing_tracker_records:
|
|
tracker_name = record.get("tracker_name")
|
|
if tracker_name:
|
|
existing_tracker_names.add(tracker_name)
|
|
|
|
# Track tracker names being processed in this batch to prevent duplicates
|
|
processing_tracker_names = set()
|
|
|
|
# Process each central record to find tracker names
|
|
for central_record in central_records:
|
|
tracker_name = central_record.get("tracker_name")
|
|
if not tracker_name:
|
|
continue
|
|
|
|
# Check if tracker_name is a JSON array (concurrent tracker format)
|
|
# If it's a JSON array, skip it - these are normalized tracker names, not full report names
|
|
# We only process full report names that match the hybrid pattern
|
|
try:
|
|
if isinstance(tracker_name, str):
|
|
parsed_tracker_name = json.loads(tracker_name)
|
|
if isinstance(parsed_tracker_name, list):
|
|
# This is a JSON array of normalized tracker names, skip it
|
|
# These are from concurrent trackers and don't need hybrid tracker records
|
|
continue
|
|
except (json.JSONDecodeError, TypeError):
|
|
# Not a JSON array, continue processing as a string
|
|
pass
|
|
|
|
# Extract the base tracker name by removing _wrapper_tenant_ or _tracker_tenant_ suffix
|
|
base_tracker_name = None
|
|
if "_wrapper_tenant_" in tracker_name:
|
|
base_tracker_name = tracker_name.split("_wrapper_tenant_")[0]
|
|
elif "_tracker_tenant_" in tracker_name:
|
|
base_tracker_name = tracker_name.split("_tracker_tenant_")[0]
|
|
|
|
if not base_tracker_name:
|
|
continue
|
|
|
|
# Remove the trackme_<component>_hybrid_ prefix to get the actual tracker name
|
|
# This applies to all components that follow this naming convention
|
|
expected_prefix = f"trackme_{component}_hybrid_"
|
|
if base_tracker_name.startswith(expected_prefix):
|
|
actual_tracker_name = base_tracker_name.replace(expected_prefix, "", 1)
|
|
else:
|
|
actual_tracker_name = base_tracker_name
|
|
|
|
# Check if this tracker exists in the dedicated collection (by name or ID)
|
|
# Also check if we're already processing this tracker name in this batch
|
|
if (actual_tracker_name not in existing_tracker_names and
|
|
actual_tracker_name not in processing_tracker_names):
|
|
|
|
# Add to processing set to prevent duplicates in this batch
|
|
processing_tracker_names.add(actual_tracker_name)
|
|
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Tracker "{actual_tracker_name}" not found in dedicated collection, creating record.'
|
|
)
|
|
|
|
# Create a new tracker record in the dedicated collection
|
|
# Build knowledge_objects with both wrapper and tracker reports
|
|
reports_list = []
|
|
|
|
# Add both wrapper and tracker reports
|
|
wrapper_name = tracker_name.replace("_tracker_tenant_", "_wrapper_tenant_")
|
|
reports_list = [wrapper_name, tracker_name]
|
|
|
|
# Build knowledge_objects structure
|
|
knowledge_objects = {
|
|
"reports": reports_list
|
|
}
|
|
|
|
# Macros are only applicable to dsm, dhm, mhm components
|
|
if component in ["dsm", "dhm", "mhm"]:
|
|
# Extract the tracker identifier from the base tracker name
|
|
# Example: trackme_dsm_hybrid_tracker-iew8hkxv -> tracker-iew8hkxv
|
|
if "_hybrid_" in base_tracker_name:
|
|
tracker_identifier = base_tracker_name.split("_hybrid_")[1]
|
|
macro_name = f"trackme_{component}_hybrid_root_constraint_{tracker_identifier}_tenant_{self.tenant_id}"
|
|
knowledge_objects["macros"] = [macro_name]
|
|
|
|
new_tracker_record = {
|
|
"tracker_name": actual_tracker_name,
|
|
"tracker_id": actual_tracker_name, # tracker_id should equal tracker_name
|
|
"knowledge_objects": json.dumps(knowledge_objects, indent=2),
|
|
"created_time": time.time(),
|
|
"created_by": "health_tracker"
|
|
}
|
|
|
|
try:
|
|
# Final safety check: verify the tracker doesn't exist before insertion
|
|
final_check = tracker_collection.data.query(query=json.dumps({"tracker_name": actual_tracker_name}))
|
|
if not final_check:
|
|
tracker_collection.data.insert(json.dumps(new_tracker_record))
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Successfully created tracker record for "{actual_tracker_name}" in dedicated collection.'
|
|
)
|
|
else:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Tracker "{actual_tracker_name}" already exists in dedicated collection, skipping creation.'
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, Failed to create tracker record for "{actual_tracker_name}", exception: {str(e)}'
|
|
)
|
|
|
|
# Main logic
|
|
components = ["dsm", "dhm", "mhm", "flx", "wlk", "fqm"]
|
|
for component in components:
|
|
if vtenant_record.get(f"tenant_{component}_enabled"):
|
|
check_trackers_definition(vtenant_record, component)
|
|
check_trackers_existence_in_dedicated_kvstore(vtenant_record, component)
|
|
|
|
# end task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
|
|
)
|
|
|
|
#
|
|
# task: check_alerts_definition
|
|
#
|
|
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "check_alerts_definition"
|
|
task_start = time.time()
|
|
|
|
#
|
|
# Verify for each tenant record the content of tenant_alert_objects
|
|
# - load the tenant_alert_objects object
|
|
# - For each alert, verify that the alert exists in the system
|
|
# - if not, remove the alert from the tenant_alert_objects object and update the record
|
|
#
|
|
|
|
def check_alerts_definition(alert_name):
|
|
|
|
# get the current search definition
|
|
try:
|
|
alert_current = self.service.saved_searches[alert_name]
|
|
alert_current_search = alert_current.content.get("search")
|
|
return True
|
|
|
|
except Exception as e:
|
|
return False
|
|
|
|
# start task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
# Load the tenant_alert_objects object
|
|
tenant_alert_objects = vtenant_record.get("tenant_alert_objects", {})
|
|
if tenant_alert_objects:
|
|
try:
|
|
tenant_alert_objects = json.loads(tenant_alert_objects)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, Failed to load tenant_alert_objects, exception: {str(e)}'
|
|
)
|
|
tenant_alert_objects = {}
|
|
|
|
# alerts is a list stored in "alerts" key
|
|
alerts = tenant_alert_objects.get("alerts", [])
|
|
|
|
# verify each alert
|
|
alerts_were_removed = False
|
|
for alert_name in alerts:
|
|
alert_exists = check_alerts_definition(alert_name)
|
|
|
|
if not alert_exists:
|
|
logging.warning(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, alert="{alert_name}" not found in saved searches, will be removed from tenant_alert_objects'
|
|
)
|
|
alerts.remove(alert_name)
|
|
if not alerts_were_removed:
|
|
alerts_were_removed = True
|
|
|
|
# save the updated tenant_alert_objects
|
|
if alerts_were_removed:
|
|
tenant_alert_objects["alerts"] = alerts
|
|
|
|
# save the updated tenant_alert_objects
|
|
vtenant_record["tenant_alert_objects"] = json.dumps(
|
|
tenant_alert_objects, indent=2
|
|
)
|
|
|
|
try:
|
|
self.service.kvstore["kv_trackme_virtual_tenants"].data.update(
|
|
str(vtenant_record["_key"]), json.dumps(vtenant_record)
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, vtenant_record updated successfully.'
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, Failed to update vtenant_record, exception: {str(e)}'
|
|
)
|
|
|
|
# end task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
|
|
)
|
|
|
|
#
|
|
# task: logical_groups
|
|
#
|
|
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "check_logical_groups"
|
|
task_start = time.time()
|
|
|
|
#
|
|
# Verify Logical Groups:
|
|
# - load the logical groups KVstore collection
|
|
# - verify that for each member of the groups, the member can be found in in any of the dsm/dhm/mhm/flx/fqm KVstore collection as an actively monitoreed entity
|
|
# - if not, purge the member from the group
|
|
#
|
|
|
|
def query_kvstore_for_object(member, collection_suffix):
|
|
target_collection_name = (
|
|
f"kv_trackme_{collection_suffix}_tenant_{self.tenant_id}"
|
|
)
|
|
target_collection = self.service.kvstore[target_collection_name]
|
|
query_string = {
|
|
"$and": [
|
|
{
|
|
"object": member,
|
|
"monitored_state": "enabled",
|
|
}
|
|
]
|
|
}
|
|
try:
|
|
kvrecord = target_collection.data.query(query=json.dumps(query_string))[
|
|
0
|
|
]
|
|
kvrecord_key = kvrecord.get("_key", None)
|
|
except:
|
|
kvrecord_key = None
|
|
|
|
if kvrecord_key:
|
|
logging.debug(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, member="{member}", found in KVstore collection="{target_collection_name}"'
|
|
)
|
|
return True
|
|
|
|
return False
|
|
|
|
if (
|
|
vtenant_record.get("tenant_dsm_enabled") == True
|
|
or vtenant_record.get("tenant_dhm_enabled") == True
|
|
or vtenant_record.get("tenant_mhm_enabled") == True
|
|
or vtenant_record.get("tenant_flx_enabled") == True
|
|
or vtenant_record.get("tenant_fqm_enabled") == True
|
|
):
|
|
# log start
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting to verify logical groups, any orphan logical group member will be purged automatically.'
|
|
)
|
|
|
|
# time counter
|
|
logical_group_check_start = time.time()
|
|
|
|
#
|
|
# Logical groups collection records
|
|
#
|
|
|
|
logical_group_coll = self.service.kvstore[
|
|
f"kv_trackme_common_logical_group_tenant_{self.tenant_id}"
|
|
]
|
|
|
|
(
|
|
logical_groups_coll_records,
|
|
logical_groups_by_group_key_dict,
|
|
logical_groups_by_group_name_list,
|
|
logical_groups_by_member_dict,
|
|
logical_groups_by_member_list,
|
|
) = get_logical_groups_collection_records(logical_group_coll)
|
|
|
|
# log all returned from the function
|
|
logging.debug(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, logical_groups_coll_records={json.dumps(logical_groups_coll_records, indent=2)}, logical_groups_by_group_key_dict={json.dumps(logical_groups_by_group_key_dict, indent=2)}, logical_groups_by_group_name_list={json.dumps(logical_groups_by_group_name_list, indent=2)}, logical_groups_by_member_dict={json.dumps(logical_groups_by_member_dict, indent=2)}, logical_groups_by_member_list={json.dumps(logical_groups_by_member_list, indent=2)}'
|
|
)
|
|
|
|
# loops through logical_groups_by_member_list if not empty, then check in each KVstore collection if we have a match
|
|
logical_member_found = False
|
|
logical_members_orphans = []
|
|
|
|
# ensure logical_groups_by_member_list is a list
|
|
if isinstance(logical_groups_by_member_list, str):
|
|
logical_groups_by_member_list = [logical_groups_by_member_list]
|
|
|
|
if len(logical_groups_by_member_list) > 0:
|
|
|
|
#
|
|
# Orphans
|
|
#
|
|
|
|
for member in logical_groups_by_member_list:
|
|
|
|
for tenant_setting, collection_suffix in [
|
|
("tenant_dsm_enabled", "dsm"),
|
|
("tenant_dhm_enabled", "dhm"),
|
|
("tenant_mhm_enabled", "mhm"),
|
|
("tenant_flx_enabled", "flx"),
|
|
("tenant_fqm_enabled", "fqm"),
|
|
]:
|
|
if vtenant_record.get(tenant_setting) == True:
|
|
logical_member_found = query_kvstore_for_object(
|
|
member, collection_suffix
|
|
)
|
|
if logical_member_found:
|
|
break
|
|
if not logical_member_found:
|
|
logical_members_orphans.append(member)
|
|
|
|
# log orphans
|
|
logging.debug(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, logical_members_orphans={json.dumps(logical_members_orphans, indent=2)}'
|
|
)
|
|
|
|
# purge orphans
|
|
if len(logical_members_orphans) > 0:
|
|
# turn the list into a comma separated string
|
|
logical_members_orphans = ",".join(logical_members_orphans)
|
|
|
|
try:
|
|
logical_group_purge_remove_response = (
|
|
logical_group_remove_object_from_groups(
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self._metadata.searchinfo.session_key,
|
|
self.tenant_id,
|
|
logical_members_orphans,
|
|
)
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, orphan_members="{logical_members_orphans}", successfully purged the logical groups collection, response="{json.dumps(logical_group_purge_remove_response, indent=2)}"'
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, orphan_members="{logical_members_orphans}", failed to purge from the logical groups collection, exception={str(e)}'
|
|
)
|
|
|
|
#
|
|
# empty groups
|
|
#
|
|
|
|
for logical_group_record in logical_groups_coll_records:
|
|
|
|
# get the group name
|
|
object_group_name = logical_group_record.get("object_group_name")
|
|
|
|
# get the members
|
|
members = logical_group_record.get("object_group_members", None)
|
|
if members:
|
|
if not len(members) > 0:
|
|
members = None
|
|
|
|
if not members:
|
|
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, group="{object_group_name}", group has no members, will be purged.'
|
|
)
|
|
|
|
try:
|
|
logical_group_delete_response = (
|
|
logical_group_delete_group_by_name(
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self._metadata.searchinfo.session_key,
|
|
self.tenant_id,
|
|
object_group_name,
|
|
)
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, group="{object_group_name}", group has been purged successfully, response="{json.dumps(logical_group_delete_response, indent=2)}"'
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, group="{object_group_name}", failed to purge the group, exception={str(e)}'
|
|
)
|
|
|
|
# log time
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, logical_groups_check_duration="{round(time.time() - logical_group_check_start, 3)}"'
|
|
)
|
|
|
|
# end task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
|
|
)
|
|
|
|
#
|
|
# task: check_trackers
|
|
#
|
|
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "check_trackers_statuses"
|
|
task_start = time.time()
|
|
|
|
# start task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
# Set the query
|
|
health_search = remove_leading_spaces(
|
|
f"""
|
|
| trackme mode=post url=/services/trackme/v2/configuration/get_tenant_ops_status body=\"{{'mode': 'raw', 'tenant_id': '{self.tenant_id}'}}\"
|
|
| trackmeopsstatusexpand
|
|
"""
|
|
)
|
|
|
|
# logging
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, tenant_id="{self.tenant_id}", Starting health_search, report="{report_name}", search="{health_search}"'
|
|
)
|
|
|
|
# kwargs
|
|
kwargs_oneshot = {
|
|
"earliest_time": "-5m",
|
|
"latest_time": "now",
|
|
"output_mode": "json",
|
|
"count": 0,
|
|
}
|
|
|
|
# run the main report, every result is a Splunk search to be executed on its own thread
|
|
try:
|
|
reader = run_splunk_search(
|
|
self.service,
|
|
health_search,
|
|
kwargs_oneshot,
|
|
24,
|
|
5,
|
|
)
|
|
|
|
# Call the component register
|
|
trackme_register_tenant_object_summary(
|
|
session_key,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self.tenant_id,
|
|
"all",
|
|
report_name,
|
|
"success",
|
|
time.time(),
|
|
str(time.time() - start),
|
|
"The report was executed successfully",
|
|
"-5m",
|
|
"now",
|
|
)
|
|
|
|
for item in reader:
|
|
if isinstance(item, dict):
|
|
# verify the knowledge object - if for some reason it is not existing anymore, we should remove it
|
|
# and not take it into account any longer
|
|
|
|
# process
|
|
savedsearch_definition = None
|
|
report_name = item.get("report")
|
|
try:
|
|
savedsearch = self.service.saved_searches[report_name]
|
|
savedsearch_definition = savedsearch.content["search"]
|
|
savedsearch_content = savedsearch.content
|
|
except Exception as e:
|
|
savedsearch_definition = None
|
|
savedsearch_content = {}
|
|
logging.warning(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, the report="{report_name}" does not exist anymore, somehow it was removed without TrackMe being aware of it, will get rid of this now.'
|
|
)
|
|
|
|
if not savedsearch_definition:
|
|
# extract component
|
|
component = report_name.split("_")[1]
|
|
|
|
# purge
|
|
try:
|
|
delete_register_summary = (
|
|
trackme_delete_tenant_object_summary(
|
|
self._metadata.searchinfo.session_key,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self.tenant_id,
|
|
f"splk-{component}",
|
|
report_name,
|
|
)
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, knowledge for the report="{report_name}" was purged successfully, response="{delete_register_summary}"'
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, exception encountered while calling function trackme_delete_tenant_object_summary, exception="{str(e)}"'
|
|
)
|
|
|
|
else:
|
|
|
|
search_component = item.get("component")
|
|
search_cron_schedule = savedsearch_content.get("cron_schedule")
|
|
search_description = savedsearch_content.get("description")
|
|
search_earliest = savedsearch_content.get(
|
|
"dispatch.earliest_time"
|
|
)
|
|
search_last_duration = item.get("last_duration")
|
|
search_last_exec = item.get("last_exec")
|
|
search_last_result = item.get("last_result")
|
|
search_last_status = item.get("last_status")
|
|
search_latest = savedsearch_content.get("dispatch.latest_time")
|
|
search_report_name = report_name
|
|
search_schedule_window = savedsearch_content.get(
|
|
"schedule_window"
|
|
)
|
|
search_tenant_id = item.get("tenant_id")
|
|
search_workload_pool = savedsearch_content.get(
|
|
"workload_pool", None
|
|
)
|
|
|
|
# ACLs
|
|
acl_report_info = None
|
|
if self.get_acl:
|
|
# try to get acl
|
|
acl_link = savedsearch.links["alternate"]
|
|
acl_report_info = {}
|
|
acl_url = f"{self._metadata.searchinfo.splunkd_uri}{acl_link}/acl/list?output_mode=json"
|
|
|
|
try:
|
|
response = session.get(
|
|
acl_url,
|
|
verify=False,
|
|
timeout=600,
|
|
)
|
|
response_json = response.json()
|
|
response.raise_for_status()
|
|
acl_properties = response_json["entry"][0].get(
|
|
"acl", {}
|
|
)
|
|
acl_report_info = {
|
|
"eai:acl.owner": acl_properties.get("owner"),
|
|
"eai:acl.perms.read": acl_properties["perms"][
|
|
"read"
|
|
],
|
|
"eai:acl.perms.write": acl_properties["perms"][
|
|
"write"
|
|
],
|
|
"eai:acl.sharing": acl_properties.get("sharing"),
|
|
}
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, exception encountered while trying to get the ACL for the report="{report_name}", exception="{str(e)}"'
|
|
)
|
|
|
|
# set info record
|
|
search_info_record = {
|
|
"component": search_component,
|
|
"cron_schedule": search_cron_schedule,
|
|
"description": search_description,
|
|
"earliest": search_earliest,
|
|
"last_duration": search_last_duration,
|
|
"last_exec": search_last_exec,
|
|
"last_result": search_last_result,
|
|
"last_status": search_last_status,
|
|
"latest": search_latest,
|
|
"report": search_report_name,
|
|
"schedule_window": search_schedule_window,
|
|
"tenant_id": search_tenant_id,
|
|
}
|
|
|
|
# most often the workload pool is not set, only add if explicitly set
|
|
if search_workload_pool:
|
|
search_info_record["workload_pool"] = search_workload_pool
|
|
|
|
# add acl info
|
|
if acl_report_info:
|
|
search_info_record.update(acl_report_info)
|
|
|
|
yield {
|
|
"_time": time.time(),
|
|
"_raw": search_info_record,
|
|
"component": search_component,
|
|
"cron_schedule": search_cron_schedule,
|
|
"description": search_description,
|
|
"earliest": search_earliest,
|
|
"last_duration": search_last_duration,
|
|
"last_exec": search_last_exec,
|
|
"last_result": search_last_result,
|
|
"last_status": search_last_status,
|
|
"latest": search_latest,
|
|
"report": search_report_name,
|
|
"schedule_window": search_schedule_window,
|
|
"tenant_id": search_tenant_id,
|
|
"workload_pool": search_workload_pool,
|
|
}
|
|
|
|
# index the audit record
|
|
try:
|
|
trackme_state_event(
|
|
session_key=self._metadata.searchinfo.session_key,
|
|
splunkd_uri=self._metadata.searchinfo.splunkd_uri,
|
|
tenant_id=self.tenant_id,
|
|
index=tenant_indexes["trackme_audit_idx"],
|
|
sourcetype="trackme:health",
|
|
source=f"trackme:health:{self.tenant_id}",
|
|
record=search_info_record,
|
|
)
|
|
except Exception as e:
|
|
error_msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, exception encountered while calling function trackme_state_event, exception="{str(e)}"'
|
|
logging.error(error_msg)
|
|
raise Exception(error_msg)
|
|
|
|
except Exception as e:
|
|
# Call the component register
|
|
trackme_register_tenant_object_summary(
|
|
session_key,
|
|
self._metadata.searchinfo.splunkd_uri,
|
|
self.tenant_id,
|
|
"all",
|
|
report_name,
|
|
"failure",
|
|
time.time(),
|
|
str(time.time() - start),
|
|
str(e),
|
|
"-5m",
|
|
"now",
|
|
)
|
|
msg = f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, main search failed with exception="{str(e)}"'
|
|
logging.error(msg)
|
|
raise Exception(msg)
|
|
|
|
# end task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
|
|
)
|
|
|
|
#
|
|
# task: check_tenant_record_knowledge_objects
|
|
#
|
|
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "check_tenant_record_knowledge_objects"
|
|
task_start = time.time()
|
|
|
|
# start task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
# logic:
|
|
# For each component, check the field tenant_<component>_hybrid_objects from the vtenant record
|
|
# load the object as json, get the list reports and the list macros
|
|
# for each object, check that it actually exists in Splunk
|
|
# if not, delete the object from the vtenant record
|
|
|
|
for component in ["dsm", "dhm", "mhm", "flx", "wlk", "fqm"]:
|
|
|
|
# if the component is disabled, skip
|
|
try:
|
|
component_enablement = int(vtenant_record.get(f"tenant_{component}_enabled", 0))
|
|
except Exception as e:
|
|
component_enablement = 0
|
|
|
|
if component_enablement == 0:
|
|
continue
|
|
|
|
# get the hybrid_objects field
|
|
hybrid_objects = vtenant_record.get(
|
|
f"tenant_{component}_hybrid_objects"
|
|
)
|
|
|
|
try:
|
|
hybrid_objects = json.loads(hybrid_objects)
|
|
except Exception as e:
|
|
hybrid_objects = {}
|
|
|
|
# if the field does not exist, skip
|
|
if not hybrid_objects:
|
|
continue
|
|
|
|
# if "reports" is in the list, get the list of reports
|
|
if "reports" in hybrid_objects:
|
|
reports = hybrid_objects.get("reports")
|
|
else:
|
|
reports = []
|
|
|
|
# if "macros" is in the list, get the list of macros
|
|
if "macros" in hybrid_objects:
|
|
macros = hybrid_objects.get("macros")
|
|
else:
|
|
macros = []
|
|
|
|
# check reports
|
|
if reports:
|
|
for report_name in reports:
|
|
|
|
# process
|
|
savedsearch_definition = None
|
|
try:
|
|
savedsearch = self.service.saved_searches[report_name]
|
|
savedsearch_definition = savedsearch.content["search"]
|
|
savedsearch_content = savedsearch.content
|
|
except Exception as e:
|
|
savedsearch_definition = None
|
|
savedsearch_content = {}
|
|
|
|
# purge if necessary
|
|
if not savedsearch_definition:
|
|
|
|
logging.warning(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, the report="{report_name}" does not exist anymore, somehow it was removed without TrackMe being aware of it, will get rid of this now.'
|
|
)
|
|
|
|
# remove from list in hybrid_objects, udate the vetant record and update the KVstore collection
|
|
reports.remove(report_name)
|
|
hybrid_objects["reports"] = reports
|
|
vtenant_record[f"tenant_{component}_hybrid_objects"] = (
|
|
json.dumps(hybrid_objects, indent=2)
|
|
)
|
|
|
|
try:
|
|
self.service.kvstore[
|
|
"kv_trackme_virtual_tenants"
|
|
].data.update(
|
|
str(vtenant_record["_key"]), json.dumps(vtenant_record)
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, vtenant_record updated successfully.'
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, Failed to update vtenant_record, exception: {str(e)}'
|
|
)
|
|
|
|
# check macros
|
|
if macros:
|
|
for macro_name in macros:
|
|
# process
|
|
macro_definition = None
|
|
try:
|
|
macro = self.service.confs["macros"][macro_name]
|
|
macro_definition = macro.content["definition"]
|
|
except Exception as e:
|
|
macro = None
|
|
macro_definition = None
|
|
|
|
# purge if necessary
|
|
if not macro_definition:
|
|
|
|
logging.warning(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, the macro="{macro_name}" does not exist anymore, somehow it was removed without TrackMe being aware of it, will get rid of this now.'
|
|
)
|
|
|
|
# remove from list in hybrid_objects, udate the vetant record and update the KVstore collection
|
|
macros.remove(macro_name)
|
|
hybrid_objects["macros"] = macros
|
|
vtenant_record[f"tenant_{component}_hybrid_objects"] = (
|
|
json.dumps(hybrid_objects, indent=2)
|
|
)
|
|
|
|
try:
|
|
self.service.kvstore[
|
|
"kv_trackme_virtual_tenants"
|
|
].data.update(
|
|
str(vtenant_record["_key"]), json.dumps(vtenant_record)
|
|
)
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, vtenant_record updated successfully.'
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, Failed to update vtenant_record, exception: {str(e)}'
|
|
)
|
|
|
|
# end task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
|
|
)
|
|
|
|
#
|
|
# task: gen_sla_breaches_events
|
|
#
|
|
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "gen_sla_breaches_events"
|
|
task_start = time.time()
|
|
|
|
# start task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
# get sla gen events frequency
|
|
try:
|
|
sla_breaches_events_frequency = int(
|
|
reqinfo["trackme_conf"]["sla"]["sla_breaches_events_frequency"]
|
|
)
|
|
except Exception as e:
|
|
sla_breaches_events_frequency = 86400
|
|
|
|
def process_sla_breaches_component(component, sla_breaches_events_frequency):
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, component="splk-{component}", task="{task_name}", task_instance_id={task_instance_id}, processing SLA breaches.'
|
|
)
|
|
|
|
# Get the KVstore collection for SLA notifications
|
|
collection_name = (
|
|
f"kv_trackme_{component}_sla_notifications_tenant_{self.tenant_id}"
|
|
)
|
|
collection = self.service.kvstore[collection_name]
|
|
|
|
# Run the search to get objects with SLA breaches
|
|
search_string = f'| trackmegetcoll tenant_id="{self.tenant_id}" component="{component}" | where monitored_state="enabled" | table alias object object_category object_state priority keyid sla_* anomaly_reason status_message | where sla_is_breached=1'
|
|
|
|
# kwargs
|
|
kwargs_search = {
|
|
"earliest_time": "-5m",
|
|
"latest_time": "now",
|
|
"preview": "false",
|
|
"output_mode": "json",
|
|
"count": 0,
|
|
}
|
|
|
|
try:
|
|
search_results = run_splunk_search(
|
|
self.service,
|
|
search_string,
|
|
kwargs_search,
|
|
24,
|
|
5,
|
|
)
|
|
|
|
for item in search_results:
|
|
if isinstance(item, dict):
|
|
try:
|
|
# Extract required fields
|
|
alias = item.get("alias")
|
|
object_value = item.get("object")
|
|
object_category = item.get("object_category")
|
|
object_state = item.get("object_state")
|
|
priority = item.get("priority")
|
|
keyid = item.get("keyid")
|
|
anomaly_reason = item.get("anomaly_reason")
|
|
status_message = item.get("status_message")
|
|
sla_class = item.get("sla_class")
|
|
sla_is_breached = item.get("sla_is_breached")
|
|
sla_message = item.get("sla_message")
|
|
sla_threshold = item.get("sla_threshold")
|
|
sla_threshold_duration = item.get("sla_threshold_duration")
|
|
sla_timer = item.get("sla_timer")
|
|
sla_timer_duration = item.get("sla_timer_duration")
|
|
|
|
# Check if we have a notification record for this object
|
|
query_string = {"_key": keyid}
|
|
try:
|
|
kvrecord = collection.data.query(
|
|
query=json.dumps(query_string)
|
|
)[0]
|
|
last_notification_time = float(kvrecord.get("mtime", 0))
|
|
current_time = time.time()
|
|
|
|
# Only generate event if last notification was > 24 hours ago
|
|
if (
|
|
current_time - last_notification_time
|
|
> sla_breaches_events_frequency
|
|
):
|
|
should_generate_event = True
|
|
else:
|
|
should_generate_event = False
|
|
except Exception:
|
|
# No record exists, we should generate an event
|
|
should_generate_event = True
|
|
last_notification_time = 0
|
|
|
|
if should_generate_event:
|
|
# Create the SLA breach event record
|
|
breach_record = {
|
|
"timeStr": time.strftime(
|
|
"%d/%m/%Y %H:%M:%S", time.localtime(time.time())
|
|
),
|
|
"tenant_id": self.tenant_id,
|
|
"alias": alias,
|
|
"object": decode_unicode(object_value),
|
|
"keyid": keyid,
|
|
"object_category": object_category,
|
|
"object_state": object_state,
|
|
"priority": priority,
|
|
"anomaly_reason": anomaly_reason,
|
|
"status_message": status_message,
|
|
"sla_class": sla_class,
|
|
"sla_is_breached": sla_is_breached,
|
|
"sla_message": sla_message,
|
|
"sla_threshold": sla_threshold,
|
|
"sla_threshold_duration": sla_threshold_duration,
|
|
"sla_timer": sla_timer,
|
|
"sla_timer_duration": sla_timer_duration,
|
|
}
|
|
|
|
# Add event_id
|
|
breach_record["event_id"] = hashlib.sha256(
|
|
json.dumps(breach_record).encode()
|
|
).hexdigest()
|
|
|
|
# Generate the event
|
|
try:
|
|
trackme_gen_state(
|
|
index=tenant_indexes["trackme_summary_idx"],
|
|
sourcetype="trackme:sla_breaches",
|
|
source=f"health_tracker:{task_name}",
|
|
event=breach_record,
|
|
)
|
|
logging.info(
|
|
f'TrackMe SLA breach event created successfully, tenant_id="{self.tenant_id}", sla_gen_events_frequency="{sla_breaches_events_frequency}", record="{json.dumps(breach_record, indent=1)}"'
|
|
)
|
|
|
|
# Update or create the notification record
|
|
notification_record = {
|
|
"_key": keyid,
|
|
"mtime": time.time(),
|
|
"last_notification": breach_record,
|
|
}
|
|
|
|
try:
|
|
collection.data.update(
|
|
keyid, json.dumps(notification_record)
|
|
)
|
|
except Exception:
|
|
collection.data.insert(
|
|
json.dumps(notification_record)
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", object="{object_value}", failed to generate a SLA breach event with exception="{e}"'
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", failed to process record with exception="{e}"'
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", component="splk-{component}", failed to run SLA breaches search with exception="{e}"'
|
|
)
|
|
|
|
# Main logic
|
|
components = ["dsm", "dhm", "mhm", "flx", "wlk", "fqm"]
|
|
if (
|
|
sla_breaches_events_frequency > 0
|
|
): # only run if the frequency is greater than 0
|
|
for component in components:
|
|
if vtenant_record.get(f"tenant_{component}_enabled"):
|
|
process_sla_breaches_component(
|
|
component, sla_breaches_events_frequency
|
|
)
|
|
|
|
# end task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
|
|
)
|
|
|
|
#
|
|
# task: unclosed_stateful_incidents
|
|
#
|
|
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "unclosed_stateful_incidents"
|
|
task_start = time.time()
|
|
|
|
# start task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
# objective: get any opened or updated incidents in the KVstore, verify that:
|
|
# - the entity associated with the incident is still existing and with monitored_state="enabled", if not the incident will be updated and closed.
|
|
# - if the entity exists, but is in a non alerting state (green, blue), and the incident is older than 24 hours, the incident will be updated and closed.
|
|
|
|
# get the KVstore collection for stateful incidents
|
|
stateful_incidents_collection_name = (
|
|
f"kv_trackme_stateful_alerting_tenant_{self.tenant_id}"
|
|
)
|
|
stateful_incidents_collection = self.service.kvstore[
|
|
stateful_incidents_collection_name
|
|
]
|
|
|
|
def get_stateful_incidents(collection_name, collection):
|
|
|
|
collection_records = []
|
|
collection_records_keys = set()
|
|
collection_dict = {}
|
|
|
|
try:
|
|
end = False
|
|
skip_tracker = 0
|
|
while end == False:
|
|
process_collection_records = collection.data.query(
|
|
skip=skip_tracker
|
|
)
|
|
if len(process_collection_records) != 0:
|
|
for item in process_collection_records:
|
|
if item.get("_key") not in collection_records_keys:
|
|
if item.get("alert_status") in ["opened", "updated"]:
|
|
collection_records.append(item)
|
|
collection_records_keys.add(item.get("object"))
|
|
collection_dict[item.get("object")] = item
|
|
skip_tracker += 500
|
|
else:
|
|
end = True
|
|
|
|
return collection_records, collection_records_keys, collection_dict
|
|
|
|
except Exception as e:
|
|
raise Exception(str(e))
|
|
|
|
# get the stateful incidents
|
|
try:
|
|
(
|
|
stateful_incidents_records,
|
|
stateful_incidents_keys,
|
|
stateful_incidents_dict,
|
|
) = get_stateful_incidents(
|
|
stateful_incidents_collection_name, stateful_incidents_collection
|
|
)
|
|
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to call get_kv_collection, args={stateful_incidents_collection_name}, cannot process this task, exception="{str(e)}"'
|
|
)
|
|
stateful_incidents_records = []
|
|
stateful_incidents_keys = set()
|
|
stateful_incidents_dict = {}
|
|
|
|
# iterate through opened or updated incidents
|
|
for stateful_incident in stateful_incidents_records:
|
|
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, inspecting stateful with _key="{stateful_incident.get("_key")}", incident="{stateful_incident}"'
|
|
)
|
|
|
|
# get the object
|
|
stateful_object = stateful_incident.get("object")
|
|
|
|
# get the object_id
|
|
stateful_object_id = stateful_incident.get("object_id")
|
|
|
|
# get the object_category (ex: splk-dsm)
|
|
stateful_object_category = stateful_incident.get("object_category")
|
|
|
|
# get the object_state
|
|
stateful_object_state = stateful_incident.get("object_state")
|
|
|
|
# get the object status
|
|
stateful_object_status = stateful_incident.get("object_status")
|
|
|
|
# get the mtime
|
|
stateful_incident_mtime = float(stateful_incident.get("mtime"))
|
|
|
|
# calculate the incident duration
|
|
stateful_incident_duration = time.time() - stateful_incident_mtime
|
|
|
|
# access the data KVstore collection
|
|
object_category_suffix = stateful_object_category.split("-")[1]
|
|
data_collection_name = (
|
|
f"kv_trackme_{object_category_suffix}_tenant_{self.tenant_id}"
|
|
)
|
|
data_collection = self.service.kvstore[data_collection_name]
|
|
|
|
# get the object from the data collection
|
|
try:
|
|
data_object = data_collection.data.query(
|
|
query=json.dumps({"_key": stateful_object_id})
|
|
)[0]
|
|
except Exception as e:
|
|
data_object = None
|
|
|
|
# use-case 1: the object does not exist anymore
|
|
stateful_object_exists = True
|
|
|
|
if not data_object:
|
|
stateful_object_exists = False
|
|
|
|
# use-case 2: the object exists, but is in a non alerting state while the incident has not been closed 24 hours later
|
|
stateful_incident_outdated = False
|
|
|
|
if stateful_object_exists:
|
|
if data_object.get("object_state", "green") in ["green", "blue"]:
|
|
if stateful_incident_duration > 86400:
|
|
stateful_incident_outdated = True
|
|
elif data_object.get("monitored_state") != "enabled":
|
|
stateful_incident_outdated = True
|
|
|
|
# Update the incident if necessary
|
|
if not stateful_object_exists or stateful_incident_outdated:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, update of outdated stateful incident is required, stateful_object_exists="{stateful_object_exists}", stateful_incident_outdated="{stateful_incident_outdated}", incident="{stateful_incident}"'
|
|
)
|
|
|
|
# update the incident
|
|
stateful_incident["alert_status"] = "closed"
|
|
stateful_incident["mtime"] = time.time()
|
|
|
|
if stateful_object_exists:
|
|
stateful_incident["object_state"] = stateful_object_status
|
|
|
|
# update the incident in the KVstore
|
|
try:
|
|
stateful_incidents_collection.data.update(
|
|
stateful_incident.get("_key"), json.dumps(stateful_incident)
|
|
)
|
|
except Exception as e:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, failed to update stateful incident with exception="{e}"'
|
|
)
|
|
|
|
else:
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, no action required against incident with _key="{stateful_incident.get("_key")}", stateful_object_exists="{stateful_object_exists}", stateful_incident_outdated="{stateful_incident_outdated}"'
|
|
)
|
|
|
|
# end task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
|
|
)
|
|
|
|
#
|
|
# task: apply_licensing_restrictions
|
|
#
|
|
|
|
task_instance_id = self.get_uuid()
|
|
task_name = "apply_licensing_restrictions"
|
|
task_start = time.time()
|
|
|
|
# start task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, starting task.'
|
|
)
|
|
|
|
#
|
|
# licensing restriction
|
|
#
|
|
|
|
# if the component is a restricted component and the product is not registered, it should be disabled now
|
|
if license_is_valid == 0 and (
|
|
vtenant_record.get("tenant_flx_enabled") == 1
|
|
or vtenant_record.get("tenant_fqm_enabled") == 1
|
|
or vtenant_record.get("tenant_wlk_enabled") == 1
|
|
):
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, due to licensing restrictions, this tenant will be automatically disabled, the tenant is running a restricted component while this instance is not registered'
|
|
)
|
|
|
|
# target
|
|
target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/vtenants/admin/disable_tenant"
|
|
|
|
# data
|
|
post_data = {
|
|
"tenant_id": self.tenant_id,
|
|
"update_comment": "Auto disabling this tenant due to licensing limitation, the tenant is running a restricted component while the product is not currently registered",
|
|
"force": "true",
|
|
}
|
|
|
|
try:
|
|
response = session.post(
|
|
target_url,
|
|
data=json.dumps(post_data),
|
|
verify=False,
|
|
timeout=600,
|
|
)
|
|
return json.loads(response.text)
|
|
|
|
except Exception as e:
|
|
raise Exception(
|
|
f'An exception was encountered while attempting to disable the tenant due to licensing restrictions, exception="{str(e)}"'
|
|
)
|
|
|
|
elif (
|
|
license_is_valid == 0
|
|
and license_active_tenants > 2
|
|
and self.tenant_id not in license_active_tenants_list[0:2]
|
|
):
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, due to licensing restrictions, this tenant will be automatically disabled, the tenant is running a restricted component while this instance is not registered'
|
|
)
|
|
|
|
# target
|
|
target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/vtenants/admin/disable_tenant"
|
|
|
|
# data
|
|
post_data = {
|
|
"tenant_id": self.tenant_id,
|
|
"update_comment": f"Auto disabling this tenant due to licensing limitation, this deployment has reached the maximum number of tenants allowed ({license_active_tenants}), only the following tenants can be used: {license_active_tenants_list[0:2]}",
|
|
"force": "true",
|
|
}
|
|
|
|
try:
|
|
response = session.post(
|
|
target_url,
|
|
data=json.dumps(post_data),
|
|
verify=False,
|
|
timeout=600,
|
|
)
|
|
return json.loads(response.text)
|
|
|
|
except Exception as e:
|
|
raise Exception(
|
|
f'An exception was encountered while attempting to disable the tenant due to licensing restrictions, exception="{str(e)}"'
|
|
)
|
|
|
|
elif (
|
|
license_is_valid == 1
|
|
and license_subscription_class == "enterprise"
|
|
and license_active_tenants > 6
|
|
and self.tenant_id not in license_active_tenants_list[0:6]
|
|
):
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, due to licensing restrictions, this tenant will be automatically disabled, the tenant is over the maximum number of allowed tenants in Enterprise Edition'
|
|
)
|
|
|
|
# target
|
|
target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/vtenants/admin/disable_tenant"
|
|
|
|
# data
|
|
post_data = {
|
|
"tenant_id": self.tenant_id,
|
|
"update_comment": f"Auto disabling this tenant due to licensing limitation, this deployment has reached the maximum number of tenants allowed ({license_active_tenants}), only the following tenants can be used: {license_active_tenants_list[0:6]}",
|
|
"force": "true",
|
|
}
|
|
|
|
try:
|
|
response = session.post(
|
|
target_url,
|
|
data=json.dumps(post_data),
|
|
verify=False,
|
|
timeout=600,
|
|
)
|
|
return json.loads(response.text)
|
|
|
|
except Exception as e:
|
|
raise Exception(
|
|
f'An exception was encountered while attempting to disable the tenant due to licensing restrictions, exception="{str(e)}"'
|
|
)
|
|
|
|
# An exception was raised while attempting to validate the license
|
|
# Log the error but do nothing
|
|
elif license_is_valid == 2:
|
|
logging.error(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, an exception was raised while attempting to validate the license, no actions will be taken for now.'
|
|
)
|
|
|
|
# end task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, task="{task_name}", task_instance_id={task_instance_id}, run_time="{round(time.time()-task_start, 2)}", task has terminated.'
|
|
)
|
|
|
|
# end general task
|
|
logging.info(
|
|
f'tenant_id="{self.tenant_id}", instance_id={instance_id}, trackmetrackerhealth has terminated, total_run_time={round(time.time() - start, 3)}'
|
|
)
|
|
|
|
|
|
dispatch(HealthTracker, sys.argv, sys.stdin, sys.stdout, __name__)
|