Splunk_Deploiement/apps/trackme/lib/trackme_libs_kvstore_batch.py

#!/usr/bin/env python
# coding=utf-8
__author__ = "TrackMe Limited"
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
__credits__ = "TrackMe Limited, U.K."
__license__ = "TrackMe Limited, all rights reserved"
__version__ = "0.1.0"
__maintainer__ = "TrackMe Limited, U.K."
__email__ = "support@trackme-solutions.com"
__status__ = "PRODUCTION"
# Standard library imports
import os
import sys
import time
import logging
import json
import itertools

# Networking and URL handling imports
import requests
from urllib.parse import urlencode
import urllib3

# multithreading
import concurrent.futures

# Disable insecure request warnings for urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# splunk home
splunkhome = os.environ["SPLUNK_HOME"]

# append lib
sys.path.append(os.path.join(splunkhome, "etc", "apps", "trackme", "lib"))

# logging:
# To avoid overriding the logging destination of callers, this lib deliberately
# does not set any logging configuration and relies on the callers themselves
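
# Example (illustrative, not part of this module): a caller would configure its
# own logging destination before invoking batch_update_worker, e.g.:
#
#   import logging
#   logging.basicConfig(
#       level=logging.INFO,
#       format="%(asctime)s %(levelname)s %(message)s",
#   )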


def batch_update_worker(
    collection_name,
    collection_object,
    inputs_dict_or_list,
    parent_instance_id,
    task_instance_id,
    task_name="batch_update",
    max_multi_thread_workers=16,
):
"""
Function to handle batch updates in parallel using ThreadPoolExecutor
"""
    batch_update_collection_start = time.time()
    max_retries = 3

    # Handle both dictionary and list inputs
    final_records = []
    if isinstance(inputs_dict_or_list, dict):
        # loop through the inputs dict, promote each key to the record's _key and append
        for key, value in inputs_dict_or_list.items():
            value["_key"] = key
            final_records.append(value)
    elif isinstance(inputs_dict_or_list, list):
        # if it's a list, use the records directly
        final_records = inputs_dict_or_list
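
    # Illustrative example of the two accepted shapes (hypothetical records):
    #   dict form: {"host|001": {"object": "host1", "state": "green"}}
    #       becomes [{"_key": "host|001", "object": "host1", "state": "green"}]
    #   list form: [{"_key": "host|001", "object": "host1", "state": "green"}]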

    # process by chunks of 500 records per batch_save call
    chunks = [final_records[i : i + 500] for i in range(0, len(final_records), 500)]
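    # Note: 500 is a conservative chunk size; Splunk caps the number of documents
    # accepted per KVstore batch save (max_documents_per_batch_save in limits.conf,
    # 1000 by default), so each chunk must stay within that limit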

    total_records = sum(len(chunk) for chunk in chunks)
    successful_updates = 0
    failed_updates = 0

    def process_chunk(chunk_data):
        chunk_idx, chunk = chunk_data
        chunk_success = 0
        chunk_failed = 0
        retry_count = 0

        while retry_count < max_retries:
            try:
                collection_object.data.batch_save(*chunk)
                chunk_success = len(chunk)
                break
            except Exception as e:
                retry_count += 1
                if retry_count == max_retries:
                    chunk_failed = len(chunk)
                    logging.error(
                        f'parent_instance_id={parent_instance_id}, task="{task_name}", task_instance_id={task_instance_id}, KVstore batch failed after {max_retries} retries with exception="{str(e)}", chunk={chunk_idx}/{len(chunks)}, collection="{collection_name}"'
                    )
                else:
                    logging.warning(
                        f'parent_instance_id={parent_instance_id}, task="{task_name}", task_instance_id={task_instance_id}, KVstore batch failed, retrying ({retry_count}/{max_retries}) with exception="{str(e)}", chunk={chunk_idx}/{len(chunks)}, collection="{collection_name}"'
                    )
                    time.sleep(1)  # Add a small delay between retries

        return chunk_success, chunk_failed
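
    # process_chunk returns (successful_count, failed_count) for its chunk; since
    # batch_save upserts each document by its _key, retrying a partially applied
    # chunk is safe (re-saving a document with the same _key simply overwrites it)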

    # Use a ThreadPoolExecutor for parallel processing
    # max_multi_thread_workers is the absolute maximum allowed by configuration
    # We might use fewer workers if we have fewer chunks to process
    max_workers = max(
        1, min(max_multi_thread_workers, len(chunks))
    )  # Never exceed the configured max, use fewer if there is less work, and never less than 1

    logging.info(
        f'parent_instance_id={parent_instance_id}, task="{task_name}", task_instance_id={task_instance_id}, context="perf", parallel processing configuration, configured_max_workers="{max_multi_thread_workers}", actual_workers="{max_workers}", total_chunks="{len(chunks)}", collection="{collection_name}"'
    )
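
    # Sizing example: 12,000 records yield 24 chunks, so with the default
    # max_multi_thread_workers=16 the pool runs 16 workers; 1,200 records
    # yield 3 chunks and only 3 workers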

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all chunks for processing
        future_to_chunk = {
            executor.submit(process_chunk, (idx, chunk)): (idx, chunk)
            for idx, chunk in enumerate(chunks, 1)
        }

        # Process results as they complete
        for future in concurrent.futures.as_completed(future_to_chunk):
            chunk_success, chunk_failed = future.result()
            successful_updates += chunk_success
            failed_updates += chunk_failed
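
    # Note: as_completed yields futures in completion order, not submission order;
    # the aggregated counters are order-independent, so this is safe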

    # perf counter for the batch operation
    run_time = round((time.time() - batch_update_collection_start), 3)
    success_rate = (
        round((successful_updates / total_records) * 100, 2)
        if total_records > 0
        else 0
    )

    logging.info(
        f'parent_instance_id={parent_instance_id}, task="{task_name}", task_instance_id={task_instance_id}, context="perf", batch KVstore update terminated, total_records="{total_records}", successful="{successful_updates}", failed="{failed_updates}", success_rate="{success_rate}%", run_time="{run_time}", collection="{collection_name}"'
    )
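

# Example usage (illustrative sketch; the collection name and records below are
# hypothetical, and the splunklib connection details depend on the caller):
#
#   import splunklib.client as client
#
#   service = client.connect(
#       host="localhost", port=8089, username="admin", password="changeme"
#   )
#   collection = service.kvstore["kv_trackme_example"]
#   records = {
#       "host|001": {"object": "host1", "state": "green"},
#       "host|002": {"object": "host2", "state": "red"},
#   }
#   batch_update_worker(
#       collection_name="kv_trackme_example",
#       collection_object=collection,
#       inputs_dict_or_list=records,
#       parent_instance_id="run-001",
#       task_instance_id="task-001",
#   )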