# Copyright (C) 2005-2025 Splunk Inc. All Rights Reserved.
"""
This is a re-implementation of the refresh queue that we had before, with the major change that this one is designed to run
concurrently. On its own it will behave appropriately; however, it is paramount that any new or existing refresh queue
handlers be audited for how much concurrency risk they carry. We currently have no transaction support from our supporting
datastore, so there's definitely a risk of race conditions.
"""
import json
import os
import signal
import sys
from abc import ABCMeta, abstractmethod
from multiprocessing.pool import ThreadPool
from time import time

from splunk.clilib.bundle_paths import make_splunkhome_path
from splunk.rest import simpleRequest
from splunk import LicenseRestriction

sys.path.append(make_splunkhome_path(['etc', 'apps', 'SA-ITOA', 'lib']))
sys.path.append(make_splunkhome_path(['etc', 'apps', 'SA-ITOA', 'lib', 'SA_ITOA_app_common']))
import itsi_path

from SA_ITOA_app_common.solnlib.modular_input import ModularInput

from ITOA import itoa_refresh_queue_utils
from ITOA.itoa_common import modular_input_should_run
from ITOA.mod_input_utils import skip_run_during_migration
from ITOA.setup_logging import logger, InstrumentCall, getLogger4ModInput
from itsi.objects.changehandlers.handler_manifest import handler_manifest
from itsi.objects.itsi_refresh_queue_job import ItsiRefreshQueueJob
from kvstore_queue.kvstore_queue_consumer import DEFAULT_HIGH_JOB_RATIO, DEFAULT_JOB_TIMEOUT, DEFAULT_MAX_RETRIES, KVStoreQueueConsumer
from kvstore_queue.refresh_queue_status_reporter import report_refresh_queue_status

_instrumentation = InstrumentCall(logger)
# Flag used for generating the restartless testing logs only once.
modular_input_reloaded = False

DEFAULT_POOL_SIZE = 8
DEFAULT_QUEUE_SIZE_LOGGING_FREQUENCY = 600  # Minimum number of seconds between attempts to log the size of the queue
INCOMPLETE_JOB_CLEANUP_INTERVAL = 3600
QUEUE_METRICS_REPORT_INTERVAL = 86400
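# The two intervals above are in seconds; they gate how often the main loop in
# do_run sweeps for jobs held by dead workers and reports queue metrics.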


class JobProcessor(metaclass=ABCMeta):
    """
    Abstract interface for a job processor
    """

    @abstractmethod
    def preprocess_job(self, job_data, other_jobs):
        """
        Perform job preprocessing steps before execution.

        @param job_data: the current job object selected for processing
        @type job_data: dict

        @param other_jobs: list of other job objects in the queue; e.g. for deduplication
        @type other_jobs: list(dict)
        """

    @abstractmethod
    def process_job(self, job_data):
        """
        Perform job processing.

        @param job_data: Job data for processing
        @type job_data: dict

        @return: True if processing was successful, otherwise False
        @rtype: bool
        """


class JobDemultiplexer(JobProcessor):
    """
    De-multiplexer for ITSI jobs. The caller passes in a field that selects
    which action to run - the demultiplexer field (demux_field). This is
    essentially the job type, but it can be used for other things.
    """

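    # Illustrative shape of a job dict (keys taken from the code below,
    # values made up for illustration only):
    #   {"_key": "...", "change_type": "<handler selector>",
    #    "changed_object_type": "...", "changed_object_key": "...",
    #    "create_time": 1700000000.0, "transaction_id": "...",
    #    "queue_time": 1.2, "number_of_failures": 0, "parent_job": None}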
    def __init__(self, session_key, demux_field, number_of_thread=DEFAULT_POOL_SIZE):
        """
        Initialize the processor object
        @param session_key: The splunkd session key
        @param demux_field: The field in the job data to demultiplex on
        @param number_of_thread: number of threads to allocate - must be a non-zero value
        """
        self.demux_field = demux_field
        self.session_key = session_key
        self.handler_manifest = handler_manifest
        if number_of_thread < 1:
            number_of_thread = DEFAULT_POOL_SIZE
        self.thread_pool = ThreadPool(processes=number_of_thread)
        self.refresh_queue_job_interface = ItsiRefreshQueueJob(self.session_key, 'nobody')
        self.last_size_logging_time = None

    def preprocess_job(self, job_data, other_jobs):
        """
        If the handler is configured to, check for duplicate jobs
        @param job_data: job object the handler is currently working on
        @param other_jobs: list of job objects in the queue
        @return: list of duplicate job objects
        @rtype: list
        """
        job_changed_object_type = job_data.get("changed_object_type")
        job_change_key = job_data.get("changed_object_key")
        if job_changed_object_type and job_change_key:
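            # Instantiate the handler and ask it whether duplicate jobs should
            # be collapsed; a duplicate is any other queued job that targets
            # the same changed_object_type and changed_object_key.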
            selector = job_data.get(self.demux_field, None)
            handler_object = self._get_handler(job_data, selector)(logger, self.session_key, self.thread_pool)
            should_remove_duplicates = getattr(handler_object, 'should_remove_duplicates', None)
            if should_remove_duplicates and callable(should_remove_duplicates) and should_remove_duplicates(job_data):
                duplicates = [job for job in other_jobs if job.get("changed_object_type") == job_changed_object_type
                              and job.get("changed_object_key") == job_change_key]
                return duplicates
        return []

    def process_job(self, job_data):
        """
        Using the demultiplexer field, process the job_data
        @param job_data: JSON data coming in to be processed. In this case, it will be a
                         refresh queue job
        @type job_data: dict
        @return: True if the job was successful, otherwise False
        @rtype: bool
        """
        start_time = time()
        create_time = float(job_data.get("create_time"))
        job_change_type = job_data.get("change_type")
        job_change_key = job_data.get("changed_object_key", "Unknown Key")
        transaction_id = job_data.get("transaction_id", None)
        job_changed_object_type = job_data.get("changed_object_type", "Unknown Change Object Type")
        successful = False
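        # Look up the handler class for this job's selector and run its
        # deferred() method; the timing values captured above and below feed
        # the transaction log line emitted at the end.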
        try:
            selector = job_data.get(self.demux_field, None)
            handler_object = self._get_handler(job_data, selector)(logger, self.session_key, self.thread_pool)
            handler_object.assert_valid_change_object(job_data)
            method_name = selector + ".deferred"
            with _instrumentation.track(method_name, transaction_id):
                itoa_refresh_queue_utils.process_map[os.getpid()] = job_data
                successful = handler_object.deferred(job_data, job_data.get("transaction_id"))
        except Exception:
            logger.exception(
                "Error processing job=%s change_type=%s tid=%s",
                job_data.get("_key"),
                job_data.get("change_type"),
                job_data.get("transaction_id"),
            )
            raise
        finally:
            # Remove current job context; use pop() so a failure before the job
            # was registered cannot raise KeyError and mask the original exception
            itoa_refresh_queue_utils.process_map.pop(os.getpid(), None)

        # Log the entire experience
        job_key = job_data.get("_key")
        end_time = time()
        job_time = end_time - start_time
        queue_time = job_data.get("queue_time")
        overall_time = end_time - create_time
        completion = "Successful" if successful else "Failed"
        number_of_failures = job_data.get("number_of_failures", 0)
        parent_job = job_data.get("parent_job", None)
        log_message = (
            "Transaction: Job %s: job_key=%s tid=%s job_change_type=%s job_changed_object_type=%s "
            "start_time=%s end_time=%s job_time=%s queue_time=%s transaction_time=%s job_change_key=%s "
            "create_time=%s handler_object=%s number_of_failures=%s parent_job=%s "
        )
        current_time = time()
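        # Querying the queue size hits the KV store, so only do it (and append
        # it to the log line) at most once per
        # DEFAULT_QUEUE_SIZE_LOGGING_FREQUENCY seconds.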
        if self.last_size_logging_time is None \
                or current_time - self.last_size_logging_time > DEFAULT_QUEUE_SIZE_LOGGING_FREQUENCY:
            self.last_size_logging_time = current_time
            current_queue_size = self.refresh_queue_job_interface.get_queue_size(
                "nobody",
                transaction_id=transaction_id
            )
            log_message += "current_queue_size=%s"
            logger.info(
                log_message, completion, job_key, transaction_id, job_change_type,
                job_changed_object_type, start_time, end_time, job_time, queue_time,
                overall_time, job_change_key, create_time, type(handler_object).__name__,
                number_of_failures, parent_job, current_queue_size,
            )
        else:
            logger.info(
                log_message, completion, job_key, transaction_id, job_change_type,
                job_changed_object_type, start_time, end_time, job_time, queue_time,
                overall_time, job_change_key, create_time, type(handler_object).__name__,
                number_of_failures, parent_job,
            )
        return successful

    def _get_handler(self, job_data, selector):
        """
        Get the handler class for a job
        @param job_data: the refresh queue job object
        @param selector: the value of the demux field
        @return: Handler class
        """
        key = job_data.get("_key")
        transaction_id = job_data.get('transaction_id')
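        # handler_manifest maps selector values to handler classes; an unknown
        # or missing selector is a hard error rather than a silent skip.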
        if selector is None:
            raise Exception("No selector found in data field=%s, key=%s, tid=%s" %
                            (self.demux_field, key, transaction_id))
        handler_class = self.handler_manifest.get(selector)
        if handler_class is None:
            raise Exception("No handler manifest entry could be found selector=%s, key=%s, tid=%s" %
                            (selector, key, transaction_id))
        return handler_class


class ItsiConsumerModularInput(ModularInput):
    """
    Modular input that processes job objects from the specified queue and collection
    """
    # Required options for Modular Input
    title = "ITSI Multiple Job Queue Processor"
    description = "Runs deferred operations to ensure consistency for ITSI knowledge objects."
    app = "SA-ITOA"
    name = "itsi_consumer"
    use_single_instance = False
    use_kvstore_checkpointer = False
    use_hec_event_writer = False
    process_jobs = True

    def __init__(self):
        """
        Basic constructor

        Initialize the class and set up shutdown signal handlers
        """
        super(ItsiConsumerModularInput, self).__init__()
        signal.signal(signal.SIGINT, self.shutdown)
        signal.signal(signal.SIGTERM, self.shutdown)

    def extra_arguments(self):
        """
        Additional argument definitions required for the modular input class
        """
        return [
            {
                'name': "log_level",
                'title': 'Logging Level',
                'description': 'Logging level to use for logging errors (ERROR, WARNING, INFO, DEBUG)',
            },
            {
                'name': 'number_of_thread',
                'title': 'Number of Threads',
                'description': 'Sets the thread pool count, or the number of actions that can execute in parallel '
                               'within a single job. For example, multiple independent actions on different '
                               'entities can execute at once.',
            },
            {
                'name': 'high_job_ratio',
                'title': 'High Job Ratio',
                'description': 'Executes high priority jobs N:1 compared to normal priority jobs. Setting this value '
                               'to 0 causes all jobs to execute regardless of priority.',
            },
            {
                'name': 'job_timeout',
                'title': 'Job Timeout',
                'description': 'The maximum amount of time, in seconds, that a job can execute. Jobs that exceed '
                               'this time limit will not run, and generate a timeout error. Setting this value to 0 '
                               'turns off the job timeout.',
            },
            {
                'name': 'max_retries',
                'title': 'Max Retries',
                'description': 'The number of times a failed job can automatically attempt to run again. '
                               'Setting this value to 0 turns off retry attempts.',
            },
        ]

    # These REST calls are wrapped in their own methods so that unit tests can
    # override them and exercise the surrounding logic in isolation.
    def request_shc_members(self):
        """
        Split into a separate method so that everything else can be abstracted
        out and tested by overriding this method
        """
        return simpleRequest('/services/shcluster/member/members',
                             sessionKey=self.session_key,
                             getargs={"output_mode": "json"},
                             raiseAllErrors=False)

    # See request_shc_members above: wrapped for unit testing.
    def request_input_conf_entries(self):
        """
        Split into a separate method so that everything else can be abstracted
        out and tested by overriding this method
        """
        return simpleRequest('/services/properties/inputs',
                             sessionKey=self.session_key,
                             getargs={"output_mode": "json"},
                             raiseAllErrors=False)

    def unclaim_incomplete_jobs(self, localhost):
        """
        With multiple processing queues, what happens if one dies unexpectedly? We need
        a way to
        1) Determine that the process has died
        2) Mark all of its jobs as unclaimed
        3) Do all of this within splunk. The worst part.

        For 1, we first compare which hosts are up and which are not. This is only
        possible insofar as the different search heads share the same kvstore - which
        is only the case if they are clustered. That means we can look through all of
        the hosts we're informed of in our shc (or single instance) and see if there
        are any hosts in the list of jobs that are not in the list provided by splunk.

        If there are any jobs assigned to hosts that don't exist, we put those back on
        the open market.

        Next, we look for jobs that are assigned to job processors that don't exist.
        Unfortunately, we can't check other hosts that well, but we can check our own
        host pretty easily. So let's do that. Once we have dead hosts and dead input
        stanzas covered, we've covered a large portion of the jobs. The only thing
        that isn't covered is hosts that are up but have 0 jobs assigned to them
        (usually through user intervention).

        We'll need a special way of dealing with that case.

        For 2, that's easy. Just rewrite the job itself to take out the processing piece.

        For 3, we'll be limited to conf files, the kvstore, etc. Not great, but whatever.
        """
        # Step 0: Are we a part of a shc?
        valid_hosts = []
        try:
            response, content = self.request_shc_members()
            if response.status != 200:
                # Assume that we are not a part of the search head cluster, just grab localhost
                is_shc = False  # noqa F842
                valid_hosts.append(localhost)
            else:
                is_shc = True  # noqa F842
                parsed_content = json.loads(content)
                for entry in parsed_content.get("entry", []):
                    entry_content = entry.get('content', {})
                    label = entry_content.get("label")
                    status = entry_content.get("status")
                    if label is not None and status == "Up":
                        valid_hosts.append(label)
        except LicenseRestriction:
            logger.exception("Must update your license. Continuing with localhost awareness only.")
            valid_hosts.append(localhost)
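        # Worker ids have the form "<input stanza>:<host>" (see self.uuid in
        # do_run), so the checks below compare both the host suffix and the
        # full worker id.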
        # Step 1: Check for any hosts that are not a part of the shc.
        # Find jobs that are assigned to the dead hosts
        valid_local_workers = []
        response, content = self.request_input_conf_entries()
        if response.status == 200:
            # If we don't get status 200, then we skip all of the local workers
            parsed_content = json.loads(content)
            for entry in parsed_content.get("entry", []):
                entry_content = entry.get('name')
                if "itsi_consumer://" in entry_content:
                    valid_local_workers.append(entry_content + ":" + localhost)
        # Step 2: Check for any jobs that are assigned to workers not present on this host.
        if not hasattr(self, "refresh_queue_job_interface"):
            self.refresh_queue_job_interface = ItsiRefreshQueueJob(self.session_key, 'nobody')

        queue_jobs = self.refresh_queue_job_interface.get_bulk('nobody')
        redo_jobs = []
        for job in queue_jobs:
            processor = job.get("processor")
            if processor is None or len(processor) == 0:
                continue
            worker_host = processor[processor.rfind(":") + 1:]
            if worker_host not in valid_hosts:
                redo_jobs.append(job)
            elif processor not in valid_local_workers and worker_host == localhost:
                redo_jobs.append(job)

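        # Re-read each job before clearing its processor: another worker may
        # have legitimately re-claimed it since get_bulk ran, in which case the
        # processor values no longer match and we leave the job alone.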
        # Individually save all of the jobs
        for job in redo_jobs:
            old_job = self.refresh_queue_job_interface.get('nobody', job['_key'])
            if old_job.get('processor') == job.get('processor'):
                # Step 3: For any jobs found in steps 1 or 2, nullify their
                # workers (if they are not claimed by other workers)
                del old_job['processor']
                self.refresh_queue_job_interface.update('nobody', job['_key'], old_job)

    @skip_run_during_migration
    def do_run(self, stanzas):
        """
        Run the modular input
        """
        if len(stanzas) == 0:
            # The feature is disabled; no stanzas are present.
            return

        logger = getLogger4ModInput(stanzas)
        # Log a message for restartless upgrade testing; useful while debugging.
        global modular_input_reloaded
        if not modular_input_reloaded:
            # We only want the first instance of the stanza - it has the name we want
            stanza_name = next(iter(stanzas.keys()))
            logger.info(f"Restartless upgrade - Reloaded modular input {stanza_name}")
            modular_input_reloaded = True

        if not modular_input_should_run(self.session_key, logger=logger):
            logger.info("Will not run itsi_consumer queue on this node")
            return

        stanza_name = next(iter(stanzas.keys()))
        stanza_config = next(iter(stanzas.values()))

        level = stanza_config.get("log_level")
        if level is not None:
            level = level.upper()
            if level not in ["ERROR", "WARN", "WARNING", "INFO", "DEBUG"]:
                level = "INFO"
            logger.setLevel(level)

        # A mostly-unique identifier that persists across iterations. Here we
        # assume that the stanza name itself is unique across stanza types; it
        # could be a uuid, or anything else, as long as it persists across
        # reboots.
        self.uuid = stanza_name + ":" + stanza_config.get("host")
        logger.debug("Running with uid=%s", self.uuid)

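        # Each numeric setting falls back to its default if it fails to parse;
        # the ratio, timeout, and retry counts are also clamped to be
        # non-negative (the pool size is defaulted in JobDemultiplexer instead).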
        try:
            number_of_thread = int(stanza_config.get("number_of_thread", DEFAULT_POOL_SIZE))
        except ValueError:
            number_of_thread = DEFAULT_POOL_SIZE
        try:
            high_job_ratio = int(stanza_config.get("high_job_ratio", DEFAULT_HIGH_JOB_RATIO))
            high_job_ratio = max(0, high_job_ratio)
        except ValueError:
            high_job_ratio = DEFAULT_HIGH_JOB_RATIO
        try:
            job_timeout = int(stanza_config.get("job_timeout", DEFAULT_JOB_TIMEOUT))
            job_timeout = max(0, job_timeout)
        except ValueError:
            job_timeout = DEFAULT_JOB_TIMEOUT
        try:
            max_retries = int(stanza_config.get("max_retries", DEFAULT_MAX_RETRIES))
            max_retries = max(0, max_retries)
        except ValueError:
            max_retries = DEFAULT_MAX_RETRIES

        logger.debug('number of threads is set to %s', number_of_thread)
        demux = JobDemultiplexer(self.session_key, 'change_type', number_of_thread)
        self.consumer = KVStoreQueueConsumer(
            self.session_key,
            logger,
            self.uuid,
            demux,
            high_job_ratio,
            job_timeout,
            max_retries,
        )
        # Next, wait for the kvstore to get up and running
        self.consumer.block_until_ready()
        last_incomplete_job_cleanup = None
        last_queue_metrics_report = None

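        # Main loop: process one job per iteration, and piggyback two periodic
        # maintenance tasks on the same loop - unclaiming jobs held by dead
        # workers, and reporting queue metrics.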
        while self.process_jobs:
            try:
                self.consumer.process_job()
            except Exception as e:
                if "Splunkd daemon is not responding: " in str(e):
                    logger.warning('Connection issue while processing handler on uid=%s. "%s" If this message occurs '
                                   'only once, KV Store may still be initializing.', self.uuid, e)
                else:
                    logger.exception("Exception processing handler on uid=%s", self.uuid)
                # Once it is logged, we want to crash out and let splunk bring us back up
                raise
            end_time = time()

            if (last_incomplete_job_cleanup is None
                    or end_time - last_incomplete_job_cleanup > INCOMPLETE_JOB_CLEANUP_INTERVAL):
                logger.debug("Checking for cleanup on incomplete jobs uid=%s", self.uuid)
                self.unclaim_incomplete_jobs(stanza_config.get("host"))
                last_incomplete_job_cleanup = end_time

            if last_queue_metrics_report is None or end_time - last_queue_metrics_report > QUEUE_METRICS_REPORT_INTERVAL:
                report_refresh_queue_status(logger, self.session_key, "nobody")
                last_queue_metrics_report = end_time

        logger.info("Exit modular input uid=%s", self.uuid)

    # pylint: disable=unused-argument
    def shutdown(self, signum, frame):
        """
        Listen for shutdown signals and stop processing
        """
        logger.info(
            f"itsi_consumer received queue shutdown signal {signum}; "
            "finishing current job and exiting"
        )
        self.process_jobs = False


if __name__ == "__main__":
    worker = ItsiConsumerModularInput()
    worker.execute()
    sys.exit(0)