# Copyright (C) 2005-2024 Splunk Inc. All Rights Reserved. """ This is a re-implementation of the refresh queue that we had before, with the major change that this one is designed to be run concurrently. Itself, it will behave appropriately. However, it is paramount that any new/existing refresh queues be audited for the amount of risk that they can have. Currently we have no transaction support from our supporting datastore, so theres definitely a risk of race conditions. """ import json import os import sys from builtins import object from multiprocessing.pool import ThreadPool from time import time from splunk.clilib.bundle_paths import make_splunkhome_path from splunk.rest import simpleRequest from splunk import LicenseRestriction sys.path.append(make_splunkhome_path(['etc', 'apps', 'SA-ITOA', 'lib'])) sys.path.append(make_splunkhome_path(['etc', 'apps', 'SA-ITOA', 'lib', 'SA_ITOA_app_common'])) import itsi_path from SA_ITOA_app_common.solnlib.modular_input import ModularInput from ITOA import itoa_refresh_queue_utils from ITOA.itoa_common import modular_input_should_run from ITOA.mod_input_utils import skip_run_during_migration from ITOA.setup_logging import logger, InstrumentCall, getLogger4ModInput from itsi.objects.changehandlers.handler_manifest import handler_manifest from itsi.objects.itsi_refresh_queue_job import ItsiRefreshQueueJob from kvstore_queue.kvstore_queue_consumer import KVStoreQueueConsumer _instrumentation = InstrumentCall(logger) # Flag used for generating the restartless testing logs only once. modular_input_reloaded = False DEFAULT_POOL_SIZE = 8 DEFAULT_QUEUE_SIZE_LOGGING_FREQUENCY = 600 # Minimum number of seconds between attempts to log the size of the queue class JobDemultiplexer(object): """ De-Multiplexer for ITSI objects. A user passes in a field that will select what action to run - the demultiplexer field (demux_field) Essentially the job type, but can be used for other things """ def __init__(self, session_key, demux_field, number_of_thread=DEFAULT_POOL_SIZE): """ Initialize the process object @param session_key: The splunkd session key @param demux_field: The field in the job data to demultiplex on @param number_of_thread: number of thread to allocate - must be a none-zero value """ self.demux_field = demux_field self.session_key = session_key self.handler_manifest = handler_manifest if number_of_thread < 1: number_of_thread = DEFAULT_POOL_SIZE self.thread_pool = ThreadPool(processes=number_of_thread) self.refresh_queue_job_interface = ItsiRefreshQueueJob(self.session_key, 'nobody') self.last_size_logging_time = None def preprocess_job(self, job_data, other_jobs): """ If the handler is configured to, check for duplicate jobs @param job_data: job object the handler is currently working on @param other_jobs: list of job objects in the queue @return: list of duplicate job objects @rtype List """ job_changed_object_type = job_data.get("changed_object_type") job_change_key = job_data.get("changed_object_key") if job_changed_object_type and job_change_key: selector = job_data.get(self.demux_field, None) handler_object = self._get_handler(job_data, selector)(logger, self.session_key, self.thread_pool) should_remove_duplicates = getattr(handler_object, 'should_remove_duplicates', None) if should_remove_duplicates and callable(should_remove_duplicates) and should_remove_duplicates(job_data): duplicates = [job for job in other_jobs if job.get("changed_object_type") == job_changed_object_type and job.get("changed_object_key") == job_change_key] return duplicates return [] def process_job(self, job_data): """ Using the demultipler field, process the job_data @param job_data: JSON data coming in to be processed. In this case, it will be a refresh queue job @type job_data: Dictionary @return: Was this job successful? @rtype: Boolean """ start_time = time() create_time = float(job_data.get("create_time")) job_change_type = job_data.get("change_type") job_change_key = job_data.get("changed_object_key", "Unknown Key") transaction_id = job_data.get("transaction_id", None) job_changed_object_type = job_data.get("changed_object_type", "Unknown Change Object Type") try: selector = job_data.get(self.demux_field, None) handler_object = self._get_handler(job_data, selector)(logger, self.session_key, self.thread_pool) handler_object.assert_valid_change_object(job_data) method_name = selector + ".deferred" with _instrumentation.track(method_name, transaction_id): itoa_refresh_queue_utils.process_map[os.getpid()] = job_data.get('_key') successful = handler_object.deferred(job_data, job_data.get('transaction_id')) except Exception: successful = False logger.exception("Error processing job=%s change_type=%s tid=%s", job_data.get("_key"), job_data.get("change_type"), job_data.get("transaction_id")) raise finally: # Log the entire experience job_key = job_data.get("_key") end_time = time() job_time = end_time - start_time queue_time = job_data.get('queue_time') overall_time = end_time - create_time completion = "Successful" if not successful: completion = "Failed" number_of_failures = job_data.get('number_of_failures', 0) parent_job = job_data.get('job_data', '') logFormat = "Transaction: Job %s: job_key=%s tid=%s job_change_type=%s job_changed_object_type=%s " \ "start_time=%s end_time=%s job_time=%s queue_time=%s transaction_time=%s job_change_key=%s " \ "create_time=%s handler_object=%s number_of_failures=%s parent_job=%s " current_time = time() if self.last_size_logging_time is None \ or current_time - self.last_size_logging_time > DEFAULT_QUEUE_SIZE_LOGGING_FREQUENCY: self.last_size_logging_time = current_time current_queue_size = len(self.refresh_queue_job_interface.get_bulk('nobody', fields=['_key', 'transaction_id'])) logFormat += "current_queue_size=%s" logger.info( logFormat, completion, job_key, transaction_id, job_change_type, job_changed_object_type, start_time, end_time, job_time, queue_time, overall_time, job_change_key, create_time, type(handler_object).__name__, number_of_failures, parent_job, current_queue_size, ) else: logger.info( logFormat, completion, job_key, transaction_id, job_change_type, job_changed_object_type, start_time, end_time, job_time, queue_time, overall_time, job_change_key, create_time, type(handler_object).__name__, number_of_failures, parent_job, ) return successful def _get_handler(self, job_data, selector): """ Get the handler class for job @param job_data: the refresh queue job object @param selector: the demux field @return: Handler class """ key = job_data.get("_key") transaction_id = job_data.get('transaction_id') if selector is None: raise Exception("No selector found in data field=%s, key=%s, tid=%s" % (self.demux_field, key, transaction_id)) handler_class = self.handler_manifest.get(selector) if handler_class is None: raise Exception("No handler manifest could be found selector=%s, key=%s, tid=%s" % (selector, key, transaction_id)) return handler_class class ItsiConsumerModularInput(ModularInput): """ Modular input that processes job objects from the specified queue and collection """ # Required options for Modular Input title = "ITSI Multiple Job Queue Processor" description = "Runs deferred operations to ensure consistency for ITSI knowledge objects." app = "SA-ITOA" name = "itsi_consumer" use_single_instance = False use_kvstore_checkpointer = False use_hec_event_writer = False def extra_arguments(self): """ Additional argument definitions required for the modular input class """ return [{'name': "log_level", 'title': 'Logging Level', 'description': 'Logging level to use for logging errors (ERROR, WARNING, INFO, DEBUG)' } ] # It may seem a little strange to have these wrapped, but the good reason is # So that we can do good unit testing around this feature def request_shc_members(self): """ The reason that we split this up into a separate method is so that we can abstract everything else out and test it by overwriting this method """ return simpleRequest('/services/shcluster/member/members', sessionKey=self.session_key, getargs={"output_mode": "json"}, raiseAllErrors=False) # It may seem a little strange to have these wrapped, but the good reason is # So that we can do good unit testing around this feature def request_input_conf_entries(self): """ The reason that we split this up into a separate method is so that we can abstract everything else out and test it by overwriting this method """ return simpleRequest('/services/properties/inputs', sessionKey=self.session_key, getargs={"output_mode": "json"}, raiseAllErrors=False) def unclaim_incomplete_jobs(self, localhost): """ So, with multiple processing queues, what happens if one dies unexpectedly? We need to find a way to 1) Determine that the process has died 2) Adjust all of its jobs to unclaimed 3) Do this all within splunk. The worst part. For 1, what we need to do is first compare what hosts are up and which ones are not. Now, this is only possible as far as the different search heads share the same kvstore - which is only possible if they are clustered. That means that we can look through all of the hosts that we're informed of in our shc (or single instance) and see if there are any hosts in the list of jobs that are not in the list provided by splunk If there are any jobs assigned to hosts that don't exist, we put those back on the open market. Next, we look for jobs that are assigned to job processors that don't exist. Unfortunately, we can't check other hosts that well, but we can check our own host pretty easily. So lets do that. Once we have dead hosts and dead input stanzas covered, we've covered a large portion of the jobs. The only thing that isn't covered are hosts up that have 0 jobs assigned to them (usually through user intervention). We'll need a special way of dealing with that case For 2, thats easy. Just rewrite the job itself to take out the processing piece For 3, we'll be limited to conf files, the kvstore, etc. Not great, but whatever """ # Step 0: Are we a part of a shc? valid_hosts = [] try: response, content = self.request_shc_members() if response.status != 200: # Assume that we are not a part of the search head cluster, just grab localhost is_shc = False # noqa F842 valid_hosts.append(localhost) else: is_shc = True # noqa F842 parsed_content = json.loads(content) for entry in parsed_content.get("entry", []): entry_content = entry.get('content', {}) label = entry_content.get("label") status = entry_content.get("status") if label is not None and status == "Up": valid_hosts.append(label) except LicenseRestriction: logger.exception("Must update your license. Continuing with localhost awareness only.") valid_hosts.append(localhost) # Step 1: Check for any hosts that are not a part of the shc. # Find jobs that are assigned to the dead hosts valid_local_workers = [] response, content = self.request_input_conf_entries() if response.status == 200: # If we don't get status = 200, then we should skip any of the local workers parsed_content = json.loads(content) for entry in parsed_content.get("entry", []): entry_content = entry.get('name') if "itsi_consumer://" in entry_content: valid_local_workers.append(entry_content + ":" + localhost) # Step 2: Check for any jobs that are assigned to workers not present on this host. if not hasattr(self, "refresh_queue_job_interface"): self.refresh_queue_job_interface = ItsiRefreshQueueJob(self.session_key, 'nobody') queue_jobs = self.refresh_queue_job_interface.get_bulk('nobody') redo_jobs = [] for job in queue_jobs: processor = job.get("processor") if processor is None or len(processor) == 0: continue worker_host = processor[processor.rfind(":") + 1:] if worker_host not in valid_hosts: redo_jobs.append(job) elif processor not in valid_local_workers and worker_host == localhost: redo_jobs.append(job) # Individually save all of the jobs for job in redo_jobs: old_job = self.refresh_queue_job_interface.get('nobody', job['_key']) if old_job.get('processor') == job.get('processor'): # Step 3: For any jobs in 1 or 2, # nullify their workers (if they are not claimed by other workers) del old_job['processor'] self.refresh_queue_job_interface.update('nobody', job['_key'], old_job) @skip_run_during_migration def do_run(self, stanzas): """ Run the modular input """ if len(stanzas) == 0: # The feature is disabled, no stanzas are present. return logger = getLogger4ModInput(stanzas) # log the message for restartless upgrade testing which can be useful while debugging. global modular_input_reloaded if not modular_input_reloaded: stanza_name = next(iter(stanzas.keys())) logger.info(f"Restartless upgrade - Reloaded modular input {stanza_name}") modular_input_reloaded = True # We only want the first instance of the stanza, it has the name that we want if not modular_input_should_run(self.session_key, logger=logger): logger.info("Will not run itsi_consumer queue on this node") return stanza_name = next(iter(stanzas.keys())) stanza_config = next(iter(stanzas.values())) # A mostly unique identifier that will persist across different iterations # Here we're assuming that the stanza name itself is unique across stanza types # It could be a uuid though # Anything as long as it persists across reboots self.uuid = stanza_name + ":" + stanza_config.get('host') logger.debug("Running with uid=%s", self.uuid) level = stanza_config.get("log_level") if level is not None: level = level.upper() if level not in ["ERROR", "WARN", "WARNING", "INFO", "DEBUG"]: level = "INFO" logger.setLevel(level) try: number_of_thread = int(stanza_config.get('number_of_thread', DEFAULT_POOL_SIZE)) except Exception: number_of_thread = DEFAULT_POOL_SIZE logger.debug('number of thread is set to %s', number_of_thread) demux = JobDemultiplexer(self.session_key, 'change_type', number_of_thread) self.consumer = KVStoreQueueConsumer(self.session_key, logger, self.uuid, demux) # Next, wait for the kvstore to get up and running self.consumer.block_until_ready() start_time = time() while True: try: self.consumer.process_job() except Exception as e: if "Splunkd daemon is not responding: " in str(e): logger.warning('Connection issue while processing handler on uid=%s. "%s" If this message occurs ' 'only once, KV Store may still be initializing.', self.uuid, e) else: logger.exception("Exception processing handler on uid=%s", self.uuid) # Once it is logged, we want to crash out and let splunk bring us back up raise end_time = time() if end_time - start_time > 3600: logger.debug("Checking for cleanup on incomplete jobs uid=%s", self.uuid) # Check to see if we need to clean anything up self.unclaim_incomplete_jobs(stanza_config.get("host")) # Reset our start time start_time = time() logger.info("Exit modular input uid=%s", self.uuid) if __name__ == "__main__": worker = ItsiConsumerModularInput() worker.execute() sys.exit(0)