#!/usr/bin/env python
# coding=utf-8

__author__ = "TrackMe Limited"
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
__credits__ = "TrackMe Limited, U.K."
__license__ = "TrackMe Limited, all rights reserved"
__version__ = "0.1.0"
__maintainer__ = "TrackMe Limited, U.K."
__email__ = "support@trackme-solutions.com"
__status__ = "PRODUCTION"

# Standard library imports
import os
import sys
import time
import logging
import json
import itertools

# Networking and URL handling imports
import requests
from urllib.parse import urlencode
import urllib3

# multithreading
from concurrent.futures import ThreadPoolExecutor, as_completed

# Disable insecure request warnings for urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# splunk home
splunkhome = os.environ["SPLUNK_HOME"]

# append lib
sys.path.append(os.path.join(splunkhome, "etc", "apps", "trackme", "lib"))

# import trackme libs
from trackme_libs import (
    run_splunk_search,
)

# logging:
# To avoid overriding the logging destination of callers, these libs purposely do
# not define any logging configuration and rely on the callers themselves.


def search_kv_collection_restmode(
    logger,
    headers,
    splunkd_uri,
    collection_name,
    page=1,
    page_count=0,
    key_filter=None,
    object_filter=None,
    orderby="keyid",
):
    """
    Get records from a KVstore collection using the REST API.

    :param logger: The logger object to use for logging.
    :param headers: The headers to use for the request.
    :param splunkd_uri: The Splunkd URI.
    :param collection_name: The name of the collection to query.
    :param page: The page number to retrieve.
    :param page_count: The number of records to retrieve per page.
    :param key_filter: The key filter to apply to the query.
    :param object_filter: The object filter to apply to the query.
    :param orderby: The order by field to use for the query.
    :return: A tuple containing the records, keys, a dictionary of the records, and last_page.
    """
""" # check orderby argument if orderby not in ["keyid", "object"]: raise ValueError(f'invalid orderby argument="{orderby}"') start_time = time.time() collection_dict = {} try: # Create a session for connection pooling with requests.Session() as session: session.headers.update(headers) session.verify = False # Build base URL url = f"{splunkd_uri}/servicesNS/nobody/trackme/storage/collections/data/{collection_name}" # Add filter if specified if key_filter: url = f"{url}/{key_filter}" elif object_filter: query_dict = {"object": {"$eq": object_filter}} query = f"?{urlencode({'query': json.dumps(query_dict)})}" url = f"{url}{query}" # If pagination is needed, use it directly in the request if page_count > 0: skip = (page - 1) * page_count params = { "output_mode": "json", "skip": skip, "limit": page_count, } # Make the request response = session.get( url, params=params, timeout=600, ) response.raise_for_status() response_json = response.json() # Process results efficiently for item in response_json: if orderby == "keyid": key = item.get("_key") if key: # Only process items with valid keys collection_dict[key] = item elif orderby == "object": object = item.get("object") if object: # Only process items with valid objects collection_dict[object] = item else: # For non-paginated requests, fetch all records in chunks chunk_size = 10000 # KVstore's default limit skip = 0 while True: params = { "output_mode": "json", "skip": skip, "limit": chunk_size, } # Make the request response = session.get( url, params=params, timeout=600, ) response.raise_for_status() response_json = response.json() # If no more records, break the loop if not response_json: break # Process results efficiently for item in response_json: if orderby == "keyid": key = item.get("_key") if key: # Only process items with valid keys collection_dict[key] = item elif orderby == "object": object = item.get("object") if object: # Only process items with valid objects collection_dict[object] = item # If we got less than chunk_size records, we've reached the end if len(response_json) < chunk_size: break # Move to next chunk skip += chunk_size # Convert to required formats only once collection_records = list(collection_dict.values()) collection_records_keys = set(collection_dict.keys()) # Handle pagination if page_count == 0: last_page = 1 else: # Get total count for pagination count_url = f"{splunkd_uri}/servicesNS/nobody/trackme/storage/collections/data/{collection_name}/count" if object_filter: count_url += f"?{urlencode({'query': json.dumps({'object': {'$eq': object_filter}})})}" count_response = session.get( count_url, params={"output_mode": "json"}, timeout=600, ) count_response.raise_for_status() total_count = count_response.json().get("count", 0) last_page = (total_count + page_count - 1) // page_count except Exception as e: msg = f'REST query failed with exception="{str(e)}"' logging.error(msg) raise Exception(msg) logging.info( f'context="perf", search_kv_collection_rest, KVstore select terminated, no_records="{len(collection_records)}", run_time="{round((time.time() - start_time), 3)}", collection="{collection_name}"' ) return collection_records, collection_records_keys, collection_dict, last_page def search_kv_collection_searchmode( logger, service, collection_name, page=1, page_count=0, key_filter=None, object_filter=None, orderby="keyid", ): """ Get records from a KVstore collection using a Splunk search. :param service: The Splunk service object. :param collection_name: The name of the collection to query. 
def search_kv_collection_searchmode(
    logger,
    service,
    collection_name,
    page=1,
    page_count=0,
    key_filter=None,
    object_filter=None,
    orderby="keyid",
):
    """
    Get records from a KVstore collection using a Splunk search.

    :param logger: The logger object to use for logging.
    :param service: The Splunk service object.
    :param collection_name: The name of the collection to query.
    :param page: The page number to retrieve.
    :param page_count: The number of records to retrieve per page.
    :param key_filter: The key filter to apply to the query.
    :param object_filter: The object filter to apply to the query.
    :param orderby: The order by field to use for the query.
    :return: A tuple containing the records, keys, a dictionary of the records, and last_page.
    """

    # check orderby argument
    if orderby not in ["keyid", "object"]:
        raise ValueError(f'invalid orderby argument="{orderby}"')

    start_time = time.time()
    collection_dict = {}

    try:
        # Build the search command efficiently
        search_parts = [f'| inputlookup {collection_name.replace("kv_", "")}']

        # Add filter if specified
        if key_filter:
            search_parts.append(f'where keyid="{key_filter}"')
        elif object_filter:
            search_parts.append(f'where object="{object_filter}"')

        # Add pagination if needed: head keeps the first page * page_count
        # records, tail then keeps the last page_count of these, i.e. the page
        if page_count > 0:
            search_parts.append(f"| head {page * page_count} | tail {page_count}")

        # Complete the search
        search_parts.append("| eval keyid=_key")
        search = " ".join(search_parts)

        # Optimize search parameters
        kwargs_search = {
            "earliest_time": "-5m",
            "latest_time": "now",
            "preview": "false",
            "output_mode": "json",
            "count": 0,
        }

        # Execute search and process results
        reader = run_splunk_search(
            service,
            search,
            kwargs_search,
            24,  # max_retries
            5,  # retry_delay
        )

        # Process results efficiently
        for item in reader:
            if isinstance(item, dict):
                # orderby=keyid
                if orderby == "keyid":
                    key = item.get("keyid")
                    if key:  # Only process items with valid keys
                        collection_dict[key] = item
                elif orderby == "object":
                    object_value = item.get("object")
                    if object_value:  # Only process items with valid objects
                        collection_dict[object_value] = item

        # Convert to required formats only once
        collection_records = list(collection_dict.values())
        collection_records_keys = set(collection_dict.keys())

        # Handle pagination
        if page_count == 0:
            last_page = 1
        else:
            # Get total count for pagination
            count_search = f'| inputlookup {collection_name.replace("kv_", "")}'
            if key_filter:
                count_search += f' where keyid="{key_filter}"'
            elif object_filter:
                count_search += f' where object="{object_filter}"'
            count_search += " | stats count"

            count_reader = run_splunk_search(
                service,
                count_search,
                kwargs_search,
                24,
                5,
            )

            total_count = 0
            for item in count_reader:
                if isinstance(item, dict) and "count" in item:
                    total_count = int(item["count"])
                    break

            last_page = (total_count + page_count - 1) // page_count

    except Exception as e:
        msg = f'main search failed with exception="{str(e)}"'
        logger.error(msg)
        raise Exception(msg)

    logger.info(
        f'context="perf", search_kv_collection, KVstore select terminated, no_records="{len(collection_records)}", run_time="{round((time.time() - start_time), 3)}", collection="{collection_name}"'
    )

    return collection_records, collection_records_keys, collection_dict, last_page
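
# Usage sketch for search_kv_collection_searchmode, assuming a splunklib
# client.Service handle; the collection and object names are hypothetical:
#
#     records, keys, records_dict, last_page = search_kv_collection_searchmode(
#         logger,
#         service,
#         "kv_trackme_dsm_tenant_mytenant",
#         object_filter="myindex:mysourcetype",
#     )
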
""" # check orderby argument if orderby not in ["keyid", "object"]: raise ValueError(f'invalid orderby argument="{orderby}"') start_time = time.time() collection_dict = {} # connect to the collection collection = service.kvstore[collection_name] # add filter, if any if key_filter: query_string = {"keyid": key_filter} elif object_filter: query_string = {"object": object_filter} else: query_string = {} try: if query_string: # For filtered queries, we can fetch all matching records at once process_collection_records = collection.data.query( query=json.dumps(query_string) ) for item in process_collection_records: if orderby == "keyid": collection_dict[item.get("_key")] = item elif orderby == "object": collection_dict[item.get("object")] = item else: # For unfiltered queries, we need to use chunked fetching chunk_size = 10000 # KVstore's default limit skip_tracker = 0 while True: process_collection_records = collection.data.query( limit=chunk_size, skip=skip_tracker ) if not process_collection_records: break for item in process_collection_records: if orderby == "keyid": collection_dict[item.get("_key")] = item elif orderby == "object": collection_dict[item.get("object")] = item skip_tracker += chunk_size # Convert to list and set only once at the end collection_records = list(collection_dict.values()) collection_records_keys = set(collection_dict.keys()) # Handle pagination if page_count == 0: last_page = 1 else: total_record_count = len(collection_records) last_page = (total_record_count + page_count - 1) // page_count # Apply pagination to the records start_index = (page - 1) * page_count end_index = page * page_count collection_records = collection_records[start_index:end_index] except Exception as e: msg = f'main search failed with exception="{str(e)}"' logging.error(msg) raise Exception(msg) logging.info( f'context="perf", search_kv_collection, KVstore select terminated, no_records="{len(collection_records)}", run_time="{round((time.time() - start_time), 3)}", collection="{collection_name}"' ) return collection_records, collection_records_keys, collection_dict, last_page def search_kv_collection( service, collection_name, page=1, page_count=0, key_filter=None, object_filter=None ): """ Get records from a KVstore collection using a Splunk search. :param service: The Splunk service object. :param collection_name: The name of the collection to query. :param page: The page number to retrieve. :param page_count: The number of records to retrieve per page. :return: A tuple containing the records, keys, a dictionary of the records, and last_page. 
""" # run the main report, every result is a Splunk search to be executed on its own thread search = f'| inputlookup {collection_name.replace("kv_", "")}' # add filter, if any if key_filter: search += f' where keyid="{key_filter}"' elif object_filter: search += f' where object="{object_filter}"' # complete the search search = f"{search} | eval keyid=_key" # kwargs kwargs_search = { "earliest_time": "-5m", "latest_time": "now", "preview": "false", "output_mode": "json", "count": 0, } collection_records = [] collection_records_keys = set() collection_dict = {} start_time = time.time() try: reader = run_splunk_search( service, search, kwargs_search, 24, 5, ) for item in reader: if isinstance(item, dict): collection_records.append(item) collection_records_keys.add(item.get("keyid")) collection_dict[item.get("keyid")] = item except Exception as e: msg = f'main search failed with exception="{str(e)}"' logging.error(msg) raise Exception(msg) logging.info( f'context="perf", search_kv_collection, KVstore select terminated, no_records="{len(collection_records)}", run_time="{round((time.time() - start_time), 3)}", collection="{collection_name}"' ) # if size is 0, we consider all records as one page, simply return everything if page_count == 0: last_page = 1 return collection_records, collection_records_keys, collection_dict, last_page # if size is not 0, we need to paginate else: # calculate the total number of pages total_record_count = len(collection_records) last_page = (total_record_count + page_count - 1) // page_count # calculate the start and end index start_index = (page - 1) * page_count end_index = page * page_count # return the records, keys, dict and last_page return ( collection_records[start_index:end_index], collection_records_keys, collection_dict, last_page, ) def get_full_kv_collection( collection, collection_name, limit=1000, total_record_count=0, multi_threading=False, max_workers=50, ): """ Get all records from a KVstore collection. :param collection: The KVstore collection object. :param collection_name: The name of the collection to query. :param limit: The number of records to fetch in each request. :param total_record_count: The total number of records in the collection (if known). :return: A tuple containing the records, keys, and a dictionary of the records. 
""" collection_records = [] collection_records_keys = set() collection_dict = {} start_time = time.time() def fetch_page(skip): """Helper function to fetch a single page of data.""" try: process_collection_records = collection.data.query(limit=limit, skip=skip) return process_collection_records except Exception as e: logging.error(f"Exception fetching records with skip {skip}: {e}") return [] try: if total_record_count == 0 or not multi_threading: logging.info( f'calling get_full_kv_collection with no multi-threading, collection="{collection_name}", limit="{limit}", total_record_count="{total_record_count}", multi_threading="{multi_threading}"' ) end = False skip_tracker = 0 while end == False: process_collection_records = collection.data.query(skip=skip_tracker) if len(process_collection_records) != 0: for item in process_collection_records: if item.get("_key") not in collection_records_keys: collection_records.append(item) collection_records_keys.add(item.get("_key")) collection_dict[item.get("_key")] = item skip_tracker += limit else: end = True return collection_records, collection_records_keys, collection_dict else: # proceed with multi-threading logging.info( f'calling get_full_kv_collection with multi-threading, collection="{collection_name}", max_workers="{max_workers}"' ) # Prepare to fetch all pages concurrently with ThreadPoolExecutor(max_workers=max_workers) as executor: futures = { executor.submit(fetch_page, skip): skip for skip in range(0, total_record_count, limit) } for future in as_completed(futures): skip = futures[future] try: process_collection_records = future.result() if process_collection_records: for item in process_collection_records: if item.get("_key") not in collection_records_keys: collection_records.append(item) collection_records_keys.add(item.get("_key")) collection_dict[item.get("_key")] = item logging.debug( f"Retrieved records with skip {skip}, total={len(process_collection_records)} records" ) except Exception as e: logging.error( f"Exception processing records with skip {skip}: {e}" ) logging.info( f'context="perf", get_full_kv_collection, KVstore select terminated, no_records="{len(collection_records)}", run_time="{round((time.time() - start_time), 3)}", collection="{collection_name}"' ) return collection_records, collection_records_keys, collection_dict except Exception as e: logging.error( f"Failed to call get_kv_collection, args={collection_name}, exception={str(e)}" ) raise Exception(str(e)) def get_kv_collection( collection, collection_name, total_record_count, page=1, page_count=100 ): """ Get records from a KVstore collection with support for pagination. :param collection: The KVstore collection object. :param collection_name: The name of the collection to query. :param total_record_count: Total number of records in the collection. :param page: The page number to retrieve. :param page_count: The number of records to retrieve per page. :return: A tuple containing the records, keys, a dictionary of the records, and last_page. 
""" start_time = time.time() collection_records = [] collection_records_keys = set() collection_dict = {} # Initialize last_page with a default value last_page = 1 try: if page_count == 0: # Retrieve all records without pagination end = False skip_tracker = 0 while not end: process_collection_records = collection.data.query(skip=skip_tracker) if len(process_collection_records) == 0: end = True else: for item in process_collection_records: if item.get("_key") not in collection_records_keys: collection_records.append(item) collection_records_keys.add(item.get("_key")) skip_tracker += limit # If page_count is 0, we consider all records as one page last_page = 1 else: # Pagination logic skip_tracker = (page - 1) * page_count limit = page_count fetched_records = 0 while fetched_records < limit: process_collection_records = collection.data.query( limit=limit, skip=skip_tracker ) if process_collection_records: for item in process_collection_records: if item.get("_key") not in collection_records_keys: collection_records.append(item) collection_records_keys.add(item.get("_key")) fetched_records += 1 if fetched_records == limit: break # Stop if we have fetched enough records for the page skip_tracker += limit else: break # End if no more records to fetch # Calculate the total number of pages if total_record_count > 0 and page_count > 0: last_page = (total_record_count + page_count - 1) // page_count logging.info( f'context="perf", KVstore select terminated, no_records="{len(collection_records)}", run_time="{round((time.time() - start_time), 3)}", collection="{collection_name}", last_page="{last_page}"' ) # Include last_page in the return value return collection_records, collection_records_keys, collection_dict, last_page except Exception as e: logging.error( f"failed to call get_kv_collection, args={collection_name}, exception={str(e)}" ) raise Exception(str(e)) def get_target_from_kv_collection( filter_field, filter_value, collection, collection_name ): """ Get a specific record from a KVstore collection. :param filter_field: The field to filter the record by. :param filter_value: The value to filter the record by. Can be a single value or a list of values. :param collection: The KVstore collection object. :param collection_name: The name of the collection to query. :return: A tuple containing the records, keys, and a dictionary of the records. """ collection_records = [] collection_records_keys = set() collection_dict = {} # Handle list of values if isinstance(filter_value, list): query_string = {filter_field: {"$in": filter_value}} else: query_string = {filter_field: filter_value} try: process_collection_records = collection.data.query( query=json.dumps(query_string) ) if len(process_collection_records) != 0: for item in process_collection_records: if item.get("_key") not in collection_records_keys: collection_records.append(item) collection_records_keys.add(item.get("_key")) collection_dict[item.get("_key")] = item return collection_records, collection_records_keys, collection_dict except Exception as e: logging.error( f"failed to call get_kv_collection, args={collection_name}, exception={str(e)}" ) raise Exception(str(e)) def get_full_kv_collection_by_object(collection, collection_name): """ Get all records from a KVstore collection. :param collection: The KVstore collection object. :param collection_name: The name of the collection to query. :return: A tuple containing the records, keys, and a dictionary of the records. 
""" collection_records = [] collection_records_keys = set() collection_dict = {} try: end = False skip_tracker = 0 while end == False: process_collection_records = collection.data.query(skip=skip_tracker) if len(process_collection_records) != 0: for item in process_collection_records: if item.get("_key") not in collection_records_keys: collection_records.append(item) collection_records_keys.add(item.get("object")) collection_dict[item.get("object")] = item skip_tracker += 1000 else: end = True return collection_records, collection_records_keys, collection_dict except Exception as e: logging.error( f"failed to call get_kv_collection, args={collection_name}, exception={str(e)}" ) raise Exception(str(e)) def get_sampling_kv_collection(collection, collection_name): """ Get records from the DSM sampling collection :param collection: The KVstore collection object. :param collection_name: The name of the collection to query. :return: A tuple containing the records, keys, and a dictionary of the records. """ collection_records = [] collection_records_keys = set() collection_dict = {} try: end = False skip_tracker = 0 while end == False: process_collection_records = collection.data.query(skip=skip_tracker) if len(process_collection_records) != 0: for item in process_collection_records: if item.get("_key") not in collection_records_keys: collection_records.append(item) collection_records_keys.add(item.get("object")) # add to the dict except for raw_sample collection_dict[item.get("object")] = { k: v for k, v in item.items() if k != "raw_sample" } skip_tracker += 1000 else: end = True return collection_records, collection_records_keys, collection_dict except Exception as e: logging.error( f"failed to call get_kv_collection, args={collection_name}, exception={str(e)}" ) raise Exception(str(e)) def get_collection_documents_count(server_rest_uri, session_key, collection_name): header = { "Authorization": f"Splunk {session_key}", "Content-Type": "application/json", } url = f"{server_rest_uri}/services/server/introspection/kvstore/collectionstats?output_mode=json&count=0" try: response = requests.get( url, headers=header, verify=False, timeout=600, ) if response.status_code not in ( 200, 201, 204, ): error_msg = f'failure to retrieve the KVstore collection document count, response.status_code="{response.status_code}", response.text="{response.text}"' raise Exception(error_msg) else: response_json = response.json() collection_count = 0 entry = response_json["entry"] for item in entry: content = item.get("content") data = content.get("data") for subdata in data: subdata = json.loads(subdata) ns = subdata.get("ns") count = subdata.get("count") if ns == f"trackme.{collection_name}": collection_count = count break return collection_count except Exception as e: logging.error( f'failure to retrieve the KVstore collection document count, exception="{str(e)}"' ) raise Exception(str(e)) def get_wlk_apps_enablement_kv_collection(collection, collection_name): """ Get records from the Wlk apps enablement collection :param collection: The KVstore collection object. :param collection_name: The name of the collection to query. :return: A tuple containing the records, keys, and a dictionary of the records. 
""" collection_records = [] collection_records_keys = set() collection_dict = {} try: end = False skip_tracker = 0 while end == False: process_collection_records = collection.data.query(skip=skip_tracker) if len(process_collection_records) != 0: for item in process_collection_records: if item.get("_key") not in collection_records_keys: collection_records.append(item) collection_records_keys.add(item.get("app")) # add to the dict except for raw_sample collection_dict[item.get("app")] = item skip_tracker += 1000 else: end = True return collection_records, collection_records_keys, collection_dict except Exception as e: logging.error( f"failed to call get_kv_collection, args={collection_name}, exception={str(e)}" ) raise Exception(str(e)) def get_feeds_datagen_kv_collection(collection, collection_name, component): """ Get all records from a KVstore collection. :param collection: The KVstore collection object. :param collection_name: The name of the collection to query. :return: A tuple containing the records, keys, and a dictionary of the records. """ datagen_collection_records = [] datagen_collection_records_keys = set() datagen_collection_dict = {} datagen_collection_blocklist_not_regex_dict = {} datagen_collection_blocklist_regex_dict = {} try: end = False skip_tracker = 0 while end == False: process_collection_records = collection.data.query(skip=skip_tracker) if len(process_collection_records) != 0: for item in process_collection_records: if item.get("_key") not in datagen_collection_records_keys: datagen_collection_records.append(item) datagen_collection_records_keys.add(item.get("_key")) datagen_collection_dict[item.get("_key")] = item # blocklist if item.get("action") == "block": if item.get("is_rex") == "false": datagen_collection_blocklist_not_regex_dict[ item.get("_key") ] = { "object": item.get("object"), "object_category": item.get("object_category"), } elif item.get("is_rex") == "true": datagen_collection_blocklist_regex_dict[ item.get("_key") ] = { "object": item.get("object"), "object_category": item.get("object_category"), } skip_tracker += 1000 else: end = True return ( datagen_collection_records, datagen_collection_records_keys, datagen_collection_dict, datagen_collection_blocklist_not_regex_dict, datagen_collection_blocklist_regex_dict, ) except Exception as e: logging.error( f"failed to call get_kv_collection, args={collection_name}, exception={str(e)}" ) raise Exception(str(e)) def execute_batch_find_in_chunks(collection, dbqueries, chunk_size=500): """ Executes batch find operations in chunks to adhere to the query limit. :param collection: The collection to query. :param dbqueries: A list of query dictionaries. :param chunk_size: Maximum number of queries per batch operation. :return: A list of kvrecords. 
""" kvrecords_nested = [] # Process dbqueries in chunks for i in range(0, len(dbqueries), chunk_size): chunk = dbqueries[i : i + chunk_size] try: # Execute batch_find for the current chunk chunk_results = collection.data.batch_find(*chunk) kvrecords_nested.extend(chunk_results) except Exception as e: error_msg = f"Batch find failed for a chunk, exception={str(e)}" logging.error(error_msg) raise Exception(error_msg) return kvrecords_nested def batch_find_records_by_object(collection, object_list): dbqueries = [{"query": {"object": object_value}} for object_value in object_list] try: # Execute batch_find to retrieve records in chunks kvrecords_nested = execute_batch_find_in_chunks(collection, dbqueries) # Flatten the list of lists to get a single list of kvrecords kvrecords = list(itertools.chain.from_iterable(kvrecords_nested)) # Create a dictionary from kvrecords, keying by '_key' kvrecords_dict = {kvrecord["_key"]: kvrecord for kvrecord in kvrecords} # Return the dictionary and the flat list of kvrecords return kvrecords_dict, kvrecords except Exception as e: logging.error( f"Failed to call batch_find_records_by_object, args={object_list}, exception={str(e)}" ) raise Exception(str(e)) def batch_find_records_by_key(collection, keys_list): dbqueries = [{"query": {"_key": key}} for key in keys_list] try: # Execute batch_find to retrieve records in chunks kvrecords_nested = execute_batch_find_in_chunks(collection, dbqueries) # Flatten the list of lists to get a single list of kvrecords kvrecords = list(itertools.chain.from_iterable(kvrecords_nested)) # Create a dictionary from kvrecords, keying by '_key' kvrecords_dict = {kvrecord["_key"]: kvrecord for kvrecord in kvrecords} # Return the dictionary and the flat list of kvrecords return kvrecords_dict, kvrecords except Exception as e: logging.error( f"Failed to call batch_find_records_by_key, args={keys_list}, exception={str(e)}" ) raise Exception(str(e))