
# Copyright (C) 2005-2024 Splunk Inc. All Rights Reserved.
"""
This is a re-implementation of the refresh queue that we had before, with the major change that this one is designed to be run
concurrently. Itself, it will behave appropriately. However, it is paramount that any new/existing refresh queues be audited for
the amount of risk that they can have. Currently we have no transaction support from our supporting datastore, so
there's definitely a risk of race conditions.
"""
import json
import os
import sys
from builtins import object
from multiprocessing.pool import ThreadPool
from time import time
from splunk.clilib.bundle_paths import make_splunkhome_path
from splunk.rest import simpleRequest
from splunk import LicenseRestriction
sys.path.append(make_splunkhome_path(['etc', 'apps', 'SA-ITOA', 'lib']))
sys.path.append(make_splunkhome_path(['etc', 'apps', 'SA-ITOA', 'lib', 'SA_ITOA_app_common']))
import itsi_path
from SA_ITOA_app_common.solnlib.modular_input import ModularInput
from ITOA import itoa_refresh_queue_utils
from ITOA.itoa_common import modular_input_should_run
from ITOA.mod_input_utils import skip_run_during_migration
from ITOA.setup_logging import logger, InstrumentCall, getLogger4ModInput
from itsi.objects.changehandlers.handler_manifest import handler_manifest
from itsi.objects.itsi_refresh_queue_job import ItsiRefreshQueueJob
from kvstore_queue.kvstore_queue_consumer import KVStoreQueueConsumer
# Shared instrumentation helper used to time handler execution per transaction
_instrumentation = InstrumentCall(logger)
# Flag used for generating the restartless testing logs only once.
modular_input_reloaded = False
# Default number of worker threads for the handler ThreadPool
DEFAULT_POOL_SIZE = 8
DEFAULT_QUEUE_SIZE_LOGGING_FREQUENCY = 600 # Minimum number of seconds between attempts to log the size of the queue
class JobDemultiplexer(object):
    """
    De-Multiplexer for ITSI objects. A user passes in a field that
    will select what action to run - the demultiplexer field (demux_field)
    Essentially the job type, but can be used for other things
    """
    def __init__(self, session_key, demux_field, number_of_thread=None):
        """
        Initialize the process object

        @param session_key: The splunkd session key
        @param demux_field: The field in the job data to demultiplex on
        @param number_of_thread: number of threads to allocate; None or any
            value below 1 falls back to DEFAULT_POOL_SIZE
        """
        self.demux_field = demux_field
        self.session_key = session_key
        self.handler_manifest = handler_manifest
        if number_of_thread is None or number_of_thread < 1:
            number_of_thread = DEFAULT_POOL_SIZE
        self.thread_pool = ThreadPool(processes=number_of_thread)
        self.refresh_queue_job_interface = ItsiRefreshQueueJob(self.session_key, 'nobody')
        # Timestamp of the last queue-size log line; used to throttle the
        # (relatively expensive) get_bulk call that measures queue size.
        self.last_size_logging_time = None

    def preprocess_job(self, job_data, other_jobs):
        """
        If the handler is configured to, check for duplicate jobs

        @param job_data: job object the handler is currently working on
        @param other_jobs: list of job objects in the queue
        @return: list of duplicate job objects (jobs targeting the same
            changed object type and key); empty list when deduplication
            does not apply
        @rtype List
        """
        job_changed_object_type = job_data.get("changed_object_type")
        job_change_key = job_data.get("changed_object_key")
        if job_changed_object_type and job_change_key:
            selector = job_data.get(self.demux_field, None)
            handler_object = self._get_handler(job_data, selector)(logger, self.session_key, self.thread_pool)
            # Deduplication is opt-in per handler: only handlers exposing a
            # callable should_remove_duplicates that approves this job qualify.
            should_remove_duplicates = getattr(handler_object, 'should_remove_duplicates', None)
            if should_remove_duplicates and callable(should_remove_duplicates) and should_remove_duplicates(job_data):
                duplicates = [job for job in other_jobs if job.get("changed_object_type") == job_changed_object_type
                              and job.get("changed_object_key") == job_change_key]
                return duplicates
        return []

    def process_job(self, job_data):
        """
        Using the demultiplexer field, process the job_data

        @param job_data: JSON data coming in to be processed. In this case, it will be a refresh
                         queue job
        @type job_data: Dictionary
        @return: Was this job successful?
        @rtype: Boolean
        @raise Exception: re-raises any failure from handler lookup or execution
            after logging it
        """
        start_time = time()
        try:
            create_time = float(job_data.get("create_time"))
        except (TypeError, ValueError):
            # A missing or malformed create_time must not prevent the job from
            # running (it is only used for the transaction_time metric below).
            create_time = start_time
        job_change_type = job_data.get("change_type")
        job_change_key = job_data.get("changed_object_key", "Unknown Key")
        transaction_id = job_data.get("transaction_id", None)
        job_changed_object_type = job_data.get("changed_object_type", "Unknown Change Object Type")
        # Pre-initialize so the finally block can always log, even when handler
        # lookup/instantiation raises (previously this produced a NameError
        # that masked the original exception).
        successful = False
        handler_object = None
        try:
            selector = job_data.get(self.demux_field, None)
            handler_object = self._get_handler(job_data, selector)(logger, self.session_key, self.thread_pool)
            handler_object.assert_valid_change_object(job_data)
            method_name = selector + ".deferred"
            with _instrumentation.track(method_name, transaction_id):
                # Record which job this process is working on, for diagnostics
                itoa_refresh_queue_utils.process_map[os.getpid()] = job_data.get('_key')
                successful = handler_object.deferred(job_data, job_data.get('transaction_id'))
        except Exception:
            successful = False
            logger.exception("Error processing job=%s change_type=%s tid=%s",
                             job_data.get("_key"),
                             job_data.get("change_type"),
                             job_data.get("transaction_id"))
            raise
        finally:
            # Log the entire experience
            job_key = job_data.get("_key")
            end_time = time()
            job_time = end_time - start_time
            queue_time = job_data.get('queue_time')
            overall_time = end_time - create_time
            completion = "Successful"
            if not successful:
                completion = "Failed"
            number_of_failures = job_data.get('number_of_failures', 0)
            parent_job = job_data.get('job_data', '')
            # handler_object is None when handler lookup/instantiation failed
            handler_name = type(handler_object).__name__ if handler_object is not None else "Unknown"
            logFormat = "Transaction: Job %s: job_key=%s tid=%s job_change_type=%s job_changed_object_type=%s " \
                        "start_time=%s end_time=%s job_time=%s queue_time=%s transaction_time=%s job_change_key=%s " \
                        "create_time=%s handler_object=%s number_of_failures=%s parent_job=%s "
            current_time = time()
            # Throttle queue-size measurement: get_bulk over the whole queue is
            # expensive, so only include it at most once per logging window.
            if self.last_size_logging_time is None \
                    or current_time - self.last_size_logging_time > DEFAULT_QUEUE_SIZE_LOGGING_FREQUENCY:
                self.last_size_logging_time = current_time
                current_queue_size = len(self.refresh_queue_job_interface.get_bulk('nobody',
                                                                                  fields=['_key', 'transaction_id']))
                logFormat += "current_queue_size=%s"
                logger.info(
                    logFormat, completion, job_key, transaction_id, job_change_type, job_changed_object_type,
                    start_time, end_time, job_time, queue_time, overall_time, job_change_key, create_time,
                    handler_name, number_of_failures, parent_job, current_queue_size,
                )
            else:
                logger.info(
                    logFormat, completion, job_key, transaction_id, job_change_type, job_changed_object_type,
                    start_time, end_time, job_time, queue_time, overall_time, job_change_key, create_time,
                    handler_name, number_of_failures, parent_job,
                )
        return successful

    def _get_handler(self, job_data, selector):
        """
        Get the handler class for job

        @param job_data: the refresh queue job object
        @param selector: the value of the demux field for this job
        @return: Handler class from the handler manifest
        @raise Exception: if selector is None or no handler is registered for it
        """
        key = job_data.get("_key")
        transaction_id = job_data.get('transaction_id')
        if selector is None:
            raise Exception("No selector found in data field=%s, key=%s, tid=%s" %
                            (self.demux_field, key, transaction_id))
        handler_class = self.handler_manifest.get(selector)
        if handler_class is None:
            raise Exception("No handler manifest could be found selector=%s, key=%s, tid=%s" %
                            (selector, key, transaction_id))
        return handler_class
class ItsiConsumerModularInput(ModularInput):
    """
    Modular input that processes job objects from the specified queue and collection
    """
    # Required options for Modular Input
    title = "ITSI Multiple Job Queue Processor"
    description = "Runs deferred operations to ensure consistency for ITSI knowledge objects."
    app = "SA-ITOA"
    name = "itsi_consumer"
    use_single_instance = False
    use_kvstore_checkpointer = False
    use_hec_event_writer = False

    def extra_arguments(self):
        """
        Additional argument definitions required for the modular input class

        @return: list of extra argument specs (name/title/description dicts)
        @rtype: list
        """
        return [{'name': "log_level",
                 'title': 'Logging Level',
                 'description': 'Logging level to use for logging errors (ERROR, WARNING, INFO, DEBUG)'
                 }
                ]

    # It may seem a little strange to have these wrapped, but the good reason is
    # So that we can do good unit testing around this feature
    def request_shc_members(self):
        """
        Query splunkd for the search head cluster member list.

        The reason that we split this up into a separate method is so that we can
        abstract everything else out and test it by overwriting this method

        @return: (response, content) pair from simpleRequest
        """
        return simpleRequest('/services/shcluster/member/members',
                             sessionKey=self.session_key,
                             getargs={"output_mode": "json"},
                             raiseAllErrors=False)

    # It may seem a little strange to have these wrapped, but the good reason is
    # So that we can do good unit testing around this feature
    def request_input_conf_entries(self):
        """
        Query splunkd for the inputs.conf property entries.

        The reason that we split this up into a separate method is so that we can
        abstract everything else out and test it by overwriting this method

        @return: (response, content) pair from simpleRequest
        """
        return simpleRequest('/services/properties/inputs',
                             sessionKey=self.session_key,
                             getargs={"output_mode": "json"},
                             raiseAllErrors=False)

    def unclaim_incomplete_jobs(self, localhost):
        """
        So, with multiple processing queues, what happens if one dies unexpectedly? We need to
        find a way to
        1) Determine that the process has died
        2) Adjust all of its jobs to unclaimed
        3) Do this all within splunk. The worst part.

        For 1, what we need to do is first compare what hosts are up and which ones are not. Now,
        this is only possible as far as the different search heads share the same kvstore - which
        is only possible if they are clustered. That means that we can look through all of the
        hosts that we're informed of in our shc (or single instance) and see if there are any hosts
        in the list of jobs that are not in the list provided by splunk
        If there are any jobs assigned to hosts that don't exist, we put those back on the open
        market.
        Next, we look for jobs that are assigned to job processors that don't exist.
        Unfortunately, we can't check other hosts that well, but we can check our own host
        pretty easily. So lets do that. Once we have dead hosts and dead input stanzas covered,
        we've covered a large portion of the jobs. The only thing that isn't covered are hosts up
        that have 0 jobs assigned to them (usually through user intervention).
        We'll need a special way of dealing with that case
        For 2, thats easy. Just rewrite the job itself to take out the processing piece
        For 3, we'll be limited to conf files, the kvstore, etc. Not great, but whatever

        @param localhost: the name of the host this input is running on
        """
        # Step 0: Are we a part of a shc?
        valid_hosts = []
        try:
            response, content = self.request_shc_members()
            if response.status != 200:
                # Assume that we are not a part of the search head cluster, just grab localhost
                valid_hosts.append(localhost)
            else:
                parsed_content = json.loads(content)
                for entry in parsed_content.get("entry", []):
                    entry_content = entry.get('content', {})
                    label = entry_content.get("label")
                    status = entry_content.get("status")
                    # Only members reported "Up" count as live hosts
                    if label is not None and status == "Up":
                        valid_hosts.append(label)
        except LicenseRestriction:
            logger.exception("Must update your license. Continuing with localhost awareness only.")
            valid_hosts.append(localhost)
        # Step 1: Check for any hosts that are not a part of the shc.
        # Find jobs that are assigned to the dead hosts
        valid_local_workers = []
        response, content = self.request_input_conf_entries()
        if response.status == 200:
            # If we don't get status = 200, then we should skip any of the local workers
            parsed_content = json.loads(content)
            for entry in parsed_content.get("entry", []):
                entry_name = entry.get('name')
                # Guard against entries without a name; entry.get may return
                # None, which would crash the "in" containment test.
                if entry_name and "itsi_consumer://" in entry_name:
                    valid_local_workers.append(entry_name + ":" + localhost)
        # Step 2: Check for any jobs that are assigned to workers not present on this host.
        if not hasattr(self, "refresh_queue_job_interface"):
            self.refresh_queue_job_interface = ItsiRefreshQueueJob(self.session_key, 'nobody')
        queue_jobs = self.refresh_queue_job_interface.get_bulk('nobody')
        redo_jobs = []
        for job in queue_jobs:
            processor = job.get("processor")
            if processor is None or len(processor) == 0:
                continue
            # The processor id is "<stanza>:<host>"; extract the host portion
            worker_host = processor[processor.rfind(":") + 1:]
            if worker_host not in valid_hosts:
                redo_jobs.append(job)
            elif processor not in valid_local_workers and worker_host == localhost:
                redo_jobs.append(job)
        # Individually save all of the jobs
        for job in redo_jobs:
            old_job = self.refresh_queue_job_interface.get('nobody', job['_key'])
            # Guard against the job disappearing or being re-claimed by another
            # worker between get_bulk and this fetch.
            if old_job and old_job.get('processor') == job.get('processor'):
                # Step 3: For any jobs in 1 or 2,
                # nullify their workers (if they are not claimed by other workers)
                del old_job['processor']
                self.refresh_queue_job_interface.update('nobody', job['_key'], old_job)

    @skip_run_during_migration
    def do_run(self, stanzas):
        """
        Run the modular input: consume and process refresh queue jobs until
        splunkd stops us or an unrecoverable error occurs.

        @param stanzas: dict of stanza name -> stanza config for this input
        """
        if len(stanzas) == 0:
            # The feature is disabled, no stanzas are present.
            return
        logger = getLogger4ModInput(stanzas)
        # We only want the first instance of the stanza, it has the name that we want
        stanza_name = next(iter(stanzas.keys()))
        # log the message for restartless upgrade testing which can be useful while debugging.
        global modular_input_reloaded
        if not modular_input_reloaded:
            logger.info(f"Restartless upgrade - Reloaded modular input {stanza_name}")
            modular_input_reloaded = True
        if not modular_input_should_run(self.session_key, logger=logger):
            logger.info("Will not run itsi_consumer queue on this node")
            return
        stanza_config = next(iter(stanzas.values()))
        # A mostly unique identifier that will persist across different iterations
        # Here we're assuming that the stanza name itself is unique across stanza types
        # It could be a uuid though
        # Anything as long as it persists across reboots
        self.uuid = stanza_name + ":" + stanza_config.get('host')
        logger.debug("Running with uid=%s", self.uuid)
        level = stanza_config.get("log_level")
        if level is not None:
            level = level.upper()
            if level not in ["ERROR", "WARN", "WARNING", "INFO", "DEBUG"]:
                level = "INFO"
            logger.setLevel(level)
        try:
            number_of_thread = int(stanza_config.get('number_of_thread', DEFAULT_POOL_SIZE))
        except Exception:
            # Malformed config value: fall back to the default pool size
            number_of_thread = DEFAULT_POOL_SIZE
        logger.debug('number of thread is set to %s', number_of_thread)
        demux = JobDemultiplexer(self.session_key, 'change_type', number_of_thread)
        self.consumer = KVStoreQueueConsumer(self.session_key, logger, self.uuid, demux)
        # Next, wait for the kvstore to get up and running
        self.consumer.block_until_ready()
        start_time = time()
        while True:
            try:
                self.consumer.process_job()
            except Exception as e:
                if "Splunkd daemon is not responding: " in str(e):
                    logger.warning('Connection issue while processing handler on uid=%s. "%s" If this message occurs '
                                   'only once, KV Store may still be initializing.', self.uuid, e)
                else:
                    logger.exception("Exception processing handler on uid=%s", self.uuid)
                # Once it is logged, we want to crash out and let splunk bring us back up
                raise
            end_time = time()
            # Every hour, reclaim jobs orphaned by dead hosts/workers
            if end_time - start_time > 3600:
                logger.debug("Checking for cleanup on incomplete jobs uid=%s", self.uuid)
                # Check to see if we need to clean anything up
                self.unclaim_incomplete_jobs(stanza_config.get("host"))
                # Reset our start time
                start_time = time()
        logger.info("Exit modular input uid=%s", self.uuid)
if __name__ == "__main__":
    # Script entry point: splunkd launches this file as a modular input process.
    consumer_input = ItsiConsumerModularInput()
    consumer_input.execute()
    sys.exit(0)