#!/usr/bin/env python
# coding=utf-8

__author__ = "TrackMe Limited"
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
__credits__ = "TrackMe Limited, U.K."
__license__ = "TrackMe Limited, all rights reserved"
__version__ = "0.1.0"
__maintainer__ = "TrackMe Limited, U.K."
__email__ = "support@trackme-solutions.com"
__status__ = "PRODUCTION"

# Standard library imports
import os
import sys
import time
import logging
import json
import itertools

# Networking and URL handling imports
import requests
from urllib.parse import urlencode
import urllib3

# multithreading
from concurrent.futures import ThreadPoolExecutor, as_completed

# Disable insecure request warnings for urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# splunk home
splunkhome = os.environ["SPLUNK_HOME"]

# append lib
sys.path.append(os.path.join(splunkhome, "etc", "apps", "trackme", "lib"))

# import trackme libs
from trackme_libs import (
    run_splunk_search,
)

# logging:
# To avoid overriding the logging destination of callers, these libs deliberately
# do not define any logging configuration and rely on the callers to do so.

def search_kv_collection_restmode(
    logger,
    headers,
    splunkd_uri,
    collection_name,
    page=1,
    page_count=0,
    key_filter=None,
    object_filter=None,
    orderby="keyid",
):
    """
    Get records from a KVstore collection using the REST API.

    :param logger: The logger object.
    :param headers: The headers to use for the request.
    :param splunkd_uri: The Splunkd URI.
    :param collection_name: The name of the collection to query.
    :param page: The page number to retrieve.
    :param page_count: The number of records to retrieve per page.
    :param key_filter: The key filter to apply to the query.
    :param object_filter: The object filter to apply to the query.
    :param orderby: The order by field to use for the query.

    :return: A tuple containing the records, keys, a dictionary of the records, and last_page.
    """

    # check orderby argument
    if orderby not in ["keyid", "object"]:
        raise ValueError(f'invalid orderby argument="{orderby}"')

    start_time = time.time()
    collection_dict = {}

    try:
        # Create a session for connection pooling
        with requests.Session() as session:
            session.headers.update(headers)
            session.verify = False

            # Build base URL
            url = f"{splunkd_uri}/servicesNS/nobody/trackme/storage/collections/data/{collection_name}"

            # Add filter if specified
            if key_filter:
                url = f"{url}/{key_filter}"
            elif object_filter:
                query_dict = {"object": {"$eq": object_filter}}
                query = f"?{urlencode({'query': json.dumps(query_dict)})}"
                url = f"{url}{query}"

            # If pagination is needed, use it directly in the request
            if page_count > 0:
                skip = (page - 1) * page_count
                params = {
                    "output_mode": "json",
                    "skip": skip,
                    "limit": page_count,
                }

                # Make the request
                response = session.get(
                    url,
                    params=params,
                    timeout=600,
                )
                response.raise_for_status()
                response_json = response.json()

                # A key-filtered request returns a single record as a dict,
                # normalize to a list before processing
                if isinstance(response_json, dict):
                    response_json = [response_json]

                # Process results efficiently
                for item in response_json:
                    if orderby == "keyid":
                        key = item.get("_key")
                        if key:  # Only process items with valid keys
                            collection_dict[key] = item
                    elif orderby == "object":
                        object_value = item.get("object")
                        if object_value:  # Only process items with valid objects
                            collection_dict[object_value] = item

            else:
                # For non-paginated requests, fetch all records in chunks
                chunk_size = 10000  # KVstore's default limit
                skip = 0
                while True:
                    params = {
                        "output_mode": "json",
                        "skip": skip,
                        "limit": chunk_size,
                    }

                    # Make the request
                    response = session.get(
                        url,
                        params=params,
                        timeout=600,
                    )
                    response.raise_for_status()
                    response_json = response.json()

                    # A key-filtered request returns a single record as a dict,
                    # normalize to a list before processing
                    if isinstance(response_json, dict):
                        response_json = [response_json]

                    # If no more records, break the loop
                    if not response_json:
                        break

                    # Process results efficiently
                    for item in response_json:
                        if orderby == "keyid":
                            key = item.get("_key")
                            if key:  # Only process items with valid keys
                                collection_dict[key] = item
                        elif orderby == "object":
                            object_value = item.get("object")
                            if object_value:  # Only process items with valid objects
                                collection_dict[object_value] = item

                    # If we got less than chunk_size records, we've reached the end
                    if len(response_json) < chunk_size:
                        break

                    # Move to next chunk
                    skip += chunk_size

            # Convert to required formats only once
            collection_records = list(collection_dict.values())
            collection_records_keys = set(collection_dict.keys())

            # Handle pagination
            if page_count == 0:
                last_page = 1
            else:
                # Get total count for pagination
                count_url = f"{splunkd_uri}/servicesNS/nobody/trackme/storage/collections/data/{collection_name}/count"
                if object_filter:
                    count_url += f"?{urlencode({'query': json.dumps({'object': {'$eq': object_filter}})})}"

                count_response = session.get(
                    count_url,
                    params={"output_mode": "json"},
                    timeout=600,
                )
                count_response.raise_for_status()
                total_count = count_response.json().get("count", 0)
                last_page = (total_count + page_count - 1) // page_count

    except Exception as e:
        msg = f'REST query failed with exception="{str(e)}"'
        logger.error(msg)
        raise Exception(msg)

    logger.info(
        f'context="perf", search_kv_collection_restmode, KVstore select terminated, no_records="{len(collection_records)}", run_time="{round((time.time() - start_time), 3)}", collection="{collection_name}"'
    )

    return collection_records, collection_records_keys, collection_dict, last_page

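
# Illustrative usage sketch (not part of the library), assuming a valid
# session_key and a reachable splunkd instance; the collection name and
# values are hypothetical:
#
#   headers = {
#       "Authorization": f"Splunk {session_key}",
#       "Content-Type": "application/json",
#   }
#   records, keys, records_dict, last_page = search_kv_collection_restmode(
#       logger,
#       headers,
#       "https://localhost:8089",
#       "kv_trackme_example_collection",  # hypothetical collection
#       page=1,
#       page_count=100,
#       orderby="keyid",
#   )
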
def search_kv_collection_searchmode(
    logger,
    service,
    collection_name,
    page=1,
    page_count=0,
    key_filter=None,
    object_filter=None,
    orderby="keyid",
):
    """
    Get records from a KVstore collection using a Splunk search.

    :param logger: The logger object.
    :param service: The Splunk service object.
    :param collection_name: The name of the collection to query.
    :param page: The page number to retrieve.
    :param page_count: The number of records to retrieve per page.
    :param key_filter: The key filter to apply to the query.
    :param object_filter: The object filter to apply to the query.
    :param orderby: The order by field to use for the query.

    :return: A tuple containing the records, keys, a dictionary of the records, and last_page.
    """

    # check orderby argument
    if orderby not in ["keyid", "object"]:
        raise ValueError(f'invalid orderby argument="{orderby}"')

    start_time = time.time()
    collection_dict = {}

    try:
        # Build the search command efficiently
        search_parts = [f'| inputlookup {collection_name.replace("kv_", "")}']

        # Add filter if specified
        if key_filter:
            search_parts.append(f'where keyid="{key_filter}"')
        elif object_filter:
            search_parts.append(f'where object="{object_filter}"')

        # Add pagination if needed: head keeps the first page*page_count
        # records, tail then keeps the last page_count of those, which is
        # exactly the requested page
        if page_count > 0:
            search_parts.append(f"| head {page * page_count} | tail {page_count}")

        # Complete the search
        search_parts.append("| eval keyid=_key")
        search = " ".join(search_parts)

        # Optimize search parameters
        kwargs_search = {
            "earliest_time": "-5m",
            "latest_time": "now",
            "preview": "false",
            "output_mode": "json",
            "count": 0,
        }

        # Execute search and process results
        reader = run_splunk_search(
            service,
            search,
            kwargs_search,
            24,  # max_retries
            5,  # retry_delay
        )

        # Process results efficiently
        for item in reader:
            if isinstance(item, dict):
                # orderby=keyid
                if orderby == "keyid":
                    key = item.get("keyid")
                    if key:  # Only process items with valid keys
                        collection_dict[key] = item
                elif orderby == "object":
                    object_value = item.get("object")
                    if object_value:  # Only process items with valid objects
                        collection_dict[object_value] = item

        # Convert to required formats only once
        collection_records = list(collection_dict.values())
        collection_records_keys = set(collection_dict.keys())

        # Handle pagination
        if page_count == 0:
            last_page = 1
        else:
            # Get total count for pagination
            count_search = f'| inputlookup {collection_name.replace("kv_", "")}'
            if key_filter:
                count_search += f' where keyid="{key_filter}"'
            elif object_filter:
                count_search += f' where object="{object_filter}"'
            count_search += " | stats count"

            count_reader = run_splunk_search(
                service,
                count_search,
                kwargs_search,
                24,
                5,
            )

            total_count = 0
            for item in count_reader:
                if isinstance(item, dict) and "count" in item:
                    total_count = int(item["count"])
                    break

            last_page = (total_count + page_count - 1) // page_count

    except Exception as e:
        msg = f'main search failed with exception="{str(e)}"'
        logger.error(msg)
        raise Exception(msg)

    logger.info(
        f'context="perf", search_kv_collection_searchmode, KVstore select terminated, no_records="{len(collection_records)}", run_time="{round((time.time() - start_time), 3)}", collection="{collection_name}"'
    )

    return collection_records, collection_records_keys, collection_dict, last_page

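
# Illustrative usage sketch (not part of the library), assuming "service" is an
# authenticated splunklib.client.Service and the collection is exposed through
# a lookup definition of the same name without the "kv_" prefix:
#
#   records, keys, records_dict, last_page = search_kv_collection_searchmode(
#       logger,
#       service,
#       "kv_trackme_example_collection",  # hypothetical collection
#       object_filter="remote:example",   # hypothetical object value
#   )
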
def search_kv_collection_sdkmode(
    logger,
    service,
    collection_name,
    page=1,
    page_count=0,
    key_filter=None,
    object_filter=None,
    orderby="keyid",
):
    """
    Get records from a KVstore collection using the Splunk Python SDK.

    :param logger: The logger object.
    :param service: The Splunk service object.
    :param collection_name: The name of the collection to query.
    :param page: The page number to retrieve.
    :param page_count: The number of records to retrieve per page.
    :param key_filter: The key filter to apply to the query.
    :param object_filter: The object filter to apply to the query.
    :param orderby: The order by field to use for the query.

    :return: A tuple containing the records, keys, a dictionary of the records, and last_page.
    """

    # check orderby argument
    if orderby not in ["keyid", "object"]:
        raise ValueError(f'invalid orderby argument="{orderby}"')

    start_time = time.time()
    collection_dict = {}

    # connect to the collection
    collection = service.kvstore[collection_name]

    # add filter, if any (records are keyed by _key in the KVstore)
    if key_filter:
        query_string = {"_key": key_filter}
    elif object_filter:
        query_string = {"object": object_filter}
    else:
        query_string = {}

    try:
        if query_string:
            # For filtered queries, we can fetch all matching records at once
            process_collection_records = collection.data.query(
                query=json.dumps(query_string)
            )
            for item in process_collection_records:
                if orderby == "keyid":
                    collection_dict[item.get("_key")] = item
                elif orderby == "object":
                    collection_dict[item.get("object")] = item
        else:
            # For unfiltered queries, we need to use chunked fetching
            chunk_size = 10000  # KVstore's default limit
            skip_tracker = 0
            while True:
                process_collection_records = collection.data.query(
                    limit=chunk_size, skip=skip_tracker
                )
                if not process_collection_records:
                    break

                for item in process_collection_records:
                    if orderby == "keyid":
                        collection_dict[item.get("_key")] = item
                    elif orderby == "object":
                        collection_dict[item.get("object")] = item
                skip_tracker += chunk_size

        # Convert to list and set only once at the end
        collection_records = list(collection_dict.values())
        collection_records_keys = set(collection_dict.keys())

        # Handle pagination
        if page_count == 0:
            last_page = 1
        else:
            total_record_count = len(collection_records)
            last_page = (total_record_count + page_count - 1) // page_count
            # Apply pagination to the records
            start_index = (page - 1) * page_count
            end_index = page * page_count
            collection_records = collection_records[start_index:end_index]

    except Exception as e:
        msg = f'main search failed with exception="{str(e)}"'
        logger.error(msg)
        raise Exception(msg)

    logger.info(
        f'context="perf", search_kv_collection_sdkmode, KVstore select terminated, no_records="{len(collection_records)}", run_time="{round((time.time() - start_time), 3)}", collection="{collection_name}"'
    )

    return collection_records, collection_records_keys, collection_dict, last_page

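
# Illustrative usage sketch (not part of the library), assuming "service" is an
# authenticated splunklib.client.Service with access to the collection:
#
#   records, keys, records_dict, last_page = search_kv_collection_sdkmode(
#       logger,
#       service,
#       "kv_trackme_example_collection",  # hypothetical collection
#       page=2,
#       page_count=50,
#       orderby="object",
#   )
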
def search_kv_collection(
    service, collection_name, page=1, page_count=0, key_filter=None, object_filter=None
):
    """
    Get records from a KVstore collection using a Splunk search.

    :param service: The Splunk service object.
    :param collection_name: The name of the collection to query.
    :param page: The page number to retrieve.
    :param page_count: The number of records to retrieve per page.
    :param key_filter: The key filter to apply to the query.
    :param object_filter: The object filter to apply to the query.

    :return: A tuple containing the records, keys, a dictionary of the records, and last_page.
    """

    # build the main search: read the collection through its lookup definition
    search = f'| inputlookup {collection_name.replace("kv_", "")}'

    # add filter, if any
    if key_filter:
        search += f' where keyid="{key_filter}"'
    elif object_filter:
        search += f' where object="{object_filter}"'

    # complete the search
    search = f"{search} | eval keyid=_key"

    # kwargs
    kwargs_search = {
        "earliest_time": "-5m",
        "latest_time": "now",
        "preview": "false",
        "output_mode": "json",
        "count": 0,
    }

    collection_records = []
    collection_records_keys = set()
    collection_dict = {}

    start_time = time.time()

    try:
        reader = run_splunk_search(
            service,
            search,
            kwargs_search,
            24,
            5,
        )

        for item in reader:
            if isinstance(item, dict):
                collection_records.append(item)
                collection_records_keys.add(item.get("keyid"))
                collection_dict[item.get("keyid")] = item

    except Exception as e:
        msg = f'main search failed with exception="{str(e)}"'
        logging.error(msg)
        raise Exception(msg)

    logging.info(
        f'context="perf", search_kv_collection, KVstore select terminated, no_records="{len(collection_records)}", run_time="{round((time.time() - start_time), 3)}", collection="{collection_name}"'
    )

    # if page_count is 0, we consider all records as one page, simply return everything
    if page_count == 0:
        last_page = 1
        return collection_records, collection_records_keys, collection_dict, last_page

    # if page_count is not 0, we need to paginate
    else:
        # calculate the total number of pages
        total_record_count = len(collection_records)
        last_page = (total_record_count + page_count - 1) // page_count

        # calculate the start and end index
        start_index = (page - 1) * page_count
        end_index = page * page_count

        # return the records, keys, dict and last_page
        return (
            collection_records[start_index:end_index],
            collection_records_keys,
            collection_dict,
            last_page,
        )

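
# Illustrative usage sketch (not part of the library); unlike the paginated
# REST/SDK variants, this helper always runs a single inputlookup search and
# paginates the results in memory:
#
#   records, keys, records_dict, last_page = search_kv_collection(
#       service,
#       "kv_trackme_example_collection",  # hypothetical collection
#       page=1,
#       page_count=100,
#   )
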
def get_full_kv_collection(
    collection,
    collection_name,
    limit=1000,
    total_record_count=0,
    multi_threading=False,
    max_workers=50,
):
    """
    Get all records from a KVstore collection.

    :param collection: The KVstore collection object.
    :param collection_name: The name of the collection to query.
    :param limit: The number of records to fetch in each request.
    :param total_record_count: The total number of records in the collection (if known).
    :param multi_threading: Whether to fetch pages concurrently (requires total_record_count).
    :param max_workers: The maximum number of concurrent workers.

    :return: A tuple containing the records, keys, and a dictionary of the records.
    """
    collection_records = []
    collection_records_keys = set()
    collection_dict = {}

    start_time = time.time()

    def fetch_page(skip):
        """Helper function to fetch a single page of data."""
        try:
            process_collection_records = collection.data.query(limit=limit, skip=skip)
            return process_collection_records
        except Exception as e:
            logging.error(f"Exception fetching records with skip {skip}: {e}")
            return []

    try:

        if total_record_count == 0 or not multi_threading:

            logging.info(
                f'calling get_full_kv_collection with no multi-threading, collection="{collection_name}", limit="{limit}", total_record_count="{total_record_count}", multi_threading="{multi_threading}"'
            )

            end = False
            skip_tracker = 0
            while not end:
                # query with an explicit limit so that the skip increment below
                # matches the size of each fetched chunk
                process_collection_records = collection.data.query(
                    limit=limit, skip=skip_tracker
                )
                if len(process_collection_records) != 0:
                    for item in process_collection_records:
                        if item.get("_key") not in collection_records_keys:
                            collection_records.append(item)
                            collection_records_keys.add(item.get("_key"))
                            collection_dict[item.get("_key")] = item
                    skip_tracker += limit
                else:
                    end = True

            return collection_records, collection_records_keys, collection_dict

        else:  # proceed with multi-threading

            logging.info(
                f'calling get_full_kv_collection with multi-threading, collection="{collection_name}", max_workers="{max_workers}"'
            )

            # Prepare to fetch all pages concurrently
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = {
                    executor.submit(fetch_page, skip): skip
                    for skip in range(0, total_record_count, limit)
                }

                for future in as_completed(futures):
                    skip = futures[future]
                    try:
                        process_collection_records = future.result()
                        if process_collection_records:
                            for item in process_collection_records:
                                if item.get("_key") not in collection_records_keys:
                                    collection_records.append(item)
                                    collection_records_keys.add(item.get("_key"))
                                    collection_dict[item.get("_key")] = item
                            logging.debug(
                                f"Retrieved records with skip {skip}, total={len(process_collection_records)} records"
                            )
                    except Exception as e:
                        logging.error(
                            f"Exception processing records with skip {skip}: {e}"
                        )

            logging.info(
                f'context="perf", get_full_kv_collection, KVstore select terminated, no_records="{len(collection_records)}", run_time="{round((time.time() - start_time), 3)}", collection="{collection_name}"'
            )

            return collection_records, collection_records_keys, collection_dict

    except Exception as e:
        logging.error(
            f"failed to call get_full_kv_collection, args={collection_name}, exception={str(e)}"
        )
        raise Exception(str(e))

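
# Illustrative usage sketch (not part of the library): when the total record
# count is known (for instance via get_collection_documents_count), pages can
# be fetched concurrently:
#
#   collection = service.kvstore["kv_trackme_example_collection"]  # hypothetical
#   records, keys, records_dict = get_full_kv_collection(
#       collection,
#       "kv_trackme_example_collection",
#       limit=1000,
#       total_record_count=25000,  # hypothetical known count
#       multi_threading=True,
#       max_workers=10,
#   )
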
def get_kv_collection(
    collection, collection_name, total_record_count, page=1, page_count=100
):
    """
    Get records from a KVstore collection with support for pagination.

    :param collection: The KVstore collection object.
    :param collection_name: The name of the collection to query.
    :param total_record_count: Total number of records in the collection.
    :param page: The page number to retrieve.
    :param page_count: The number of records to retrieve per page.

    :return: A tuple containing the records, keys, a dictionary of the records, and last_page.
    """

    start_time = time.time()
    collection_records = []
    collection_records_keys = set()
    collection_dict = {}

    # Initialize last_page with a default value
    last_page = 1

    try:
        if page_count == 0:

            # Retrieve all records without pagination
            end = False
            skip_tracker = 0
            while not end:
                process_collection_records = collection.data.query(skip=skip_tracker)
                if len(process_collection_records) == 0:
                    end = True
                else:
                    for item in process_collection_records:
                        if item.get("_key") not in collection_records_keys:
                            collection_records.append(item)
                            collection_records_keys.add(item.get("_key"))
                            collection_dict[item.get("_key")] = item
                    # advance by the number of records actually returned
                    skip_tracker += len(process_collection_records)

            # If page_count is 0, we consider all records as one page
            last_page = 1

        else:
            # Pagination logic
            skip_tracker = (page - 1) * page_count
            limit = page_count

            fetched_records = 0
            while fetched_records < limit:
                process_collection_records = collection.data.query(
                    limit=limit, skip=skip_tracker
                )
                if process_collection_records:
                    for item in process_collection_records:
                        if item.get("_key") not in collection_records_keys:
                            collection_records.append(item)
                            collection_records_keys.add(item.get("_key"))
                            collection_dict[item.get("_key")] = item
                            fetched_records += 1
                            if fetched_records == limit:
                                break  # Stop if we have fetched enough records for the page
                    skip_tracker += limit
                else:
                    break  # End if no more records to fetch

        # Calculate the total number of pages
        if total_record_count > 0 and page_count > 0:
            last_page = (total_record_count + page_count - 1) // page_count

        logging.info(
            f'context="perf", KVstore select terminated, no_records="{len(collection_records)}", run_time="{round((time.time() - start_time), 3)}", collection="{collection_name}", last_page="{last_page}"'
        )

        # Include last_page in the return value
        return collection_records, collection_records_keys, collection_dict, last_page

    except Exception as e:
        logging.error(
            f"failed to call get_kv_collection, args={collection_name}, exception={str(e)}"
        )
        raise Exception(str(e))

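
# Worked example of the pagination math above (illustrative): with
# total_record_count=1042 and page_count=100, last_page is
# (1042 + 100 - 1) // 100 = 11, i.e. ten full pages plus a final page
# of 42 records; page=3 starts fetching at skip=(3 - 1) * 100 = 200.
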
def get_target_from_kv_collection(
    filter_field, filter_value, collection, collection_name
):
    """
    Get a specific record from a KVstore collection.

    :param filter_field: The field to filter the record by.
    :param filter_value: The value to filter the record by. Can be a single value or a list of values.
    :param collection: The KVstore collection object.
    :param collection_name: The name of the collection to query.

    :return: A tuple containing the records, keys, and a dictionary of the records.
    """
    collection_records = []
    collection_records_keys = set()
    collection_dict = {}

    # Handle list of values
    if isinstance(filter_value, list):
        query_string = {filter_field: {"$in": filter_value}}
    else:
        query_string = {filter_field: filter_value}

    try:
        process_collection_records = collection.data.query(
            query=json.dumps(query_string)
        )
        if len(process_collection_records) != 0:
            for item in process_collection_records:
                if item.get("_key") not in collection_records_keys:
                    collection_records.append(item)
                    collection_records_keys.add(item.get("_key"))
                    collection_dict[item.get("_key")] = item

        return collection_records, collection_records_keys, collection_dict

    except Exception as e:
        logging.error(
            f"failed to call get_target_from_kv_collection, args={collection_name}, exception={str(e)}"
        )
        raise Exception(str(e))

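
# Illustrative usage sketch (not part of the library); filter_value may be a
# single value or a list, which is translated into a $in query:
#
#   records, keys, records_dict = get_target_from_kv_collection(
#       "object",
#       ["remote:example1", "remote:example2"],  # hypothetical object values
#       collection,
#       "kv_trackme_example_collection",  # hypothetical collection
#   )
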
def get_full_kv_collection_by_object(collection, collection_name):
    """
    Get all records from a KVstore collection, keyed by their object value.

    :param collection: The KVstore collection object.
    :param collection_name: The name of the collection to query.

    :return: A tuple containing the records, keys, and a dictionary of the records.
    """
    collection_records = []
    collection_records_keys = set()
    collection_dict = {}

    try:
        end = False
        skip_tracker = 0
        while not end:
            # fetch in explicit chunks of 1000 to match the skip increment
            process_collection_records = collection.data.query(
                limit=1000, skip=skip_tracker
            )
            if len(process_collection_records) != 0:
                for item in process_collection_records:
                    # records are keyed by object, so deduplicate on that field
                    if item.get("object") not in collection_records_keys:
                        collection_records.append(item)
                        collection_records_keys.add(item.get("object"))
                        collection_dict[item.get("object")] = item
                skip_tracker += 1000
            else:
                end = True

        return collection_records, collection_records_keys, collection_dict

    except Exception as e:
        logging.error(
            f"failed to call get_full_kv_collection_by_object, args={collection_name}, exception={str(e)}"
        )
        raise Exception(str(e))

def get_sampling_kv_collection(collection, collection_name):
    """
    Get records from the DSM sampling collection.

    :param collection: The KVstore collection object.
    :param collection_name: The name of the collection to query.

    :return: A tuple containing the records, keys, and a dictionary of the records.
    """
    collection_records = []
    collection_records_keys = set()
    collection_dict = {}

    try:
        end = False
        skip_tracker = 0
        while not end:
            # fetch in explicit chunks of 1000 to match the skip increment
            process_collection_records = collection.data.query(
                limit=1000, skip=skip_tracker
            )
            if len(process_collection_records) != 0:
                for item in process_collection_records:
                    # records are keyed by object, so deduplicate on that field
                    if item.get("object") not in collection_records_keys:
                        collection_records.append(item)
                        collection_records_keys.add(item.get("object"))
                        # add to the dict, except for the raw_sample field
                        collection_dict[item.get("object")] = {
                            k: v for k, v in item.items() if k != "raw_sample"
                        }
                skip_tracker += 1000
            else:
                end = True

        return collection_records, collection_records_keys, collection_dict

    except Exception as e:
        logging.error(
            f"failed to call get_sampling_kv_collection, args={collection_name}, exception={str(e)}"
        )
        raise Exception(str(e))

def get_collection_documents_count(server_rest_uri, session_key, collection_name):
    """
    Get the document count of a KVstore collection from the kvstore
    introspection endpoint, without fetching the records themselves.

    :param server_rest_uri: The splunkd REST API URI.
    :param session_key: The Splunk session key used for authentication.
    :param collection_name: The name of the collection.

    :return: The number of documents in the collection.
    """

    header = {
        "Authorization": f"Splunk {session_key}",
        "Content-Type": "application/json",
    }
    url = f"{server_rest_uri}/services/server/introspection/kvstore/collectionstats?output_mode=json&count=0"

    try:
        response = requests.get(
            url,
            headers=header,
            verify=False,
            timeout=600,
        )
        if response.status_code not in (
            200,
            201,
            204,
        ):
            error_msg = f'failure to retrieve the KVstore collection document count, response.status_code="{response.status_code}", response.text="{response.text}"'
            raise Exception(error_msg)

        else:
            response_json = response.json()
            collection_count = 0
            entry = response_json["entry"]
            for item in entry:
                content = item.get("content")
                data = content.get("data")
                for subdata in data:
                    subdata = json.loads(subdata)
                    ns = subdata.get("ns")
                    count = subdata.get("count")
                    if ns == f"trackme.{collection_name}":
                        collection_count = count
                        break

            return collection_count

    except Exception as e:
        logging.error(
            f'failure to retrieve the KVstore collection document count, exception="{str(e)}"'
        )
        raise Exception(str(e))

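
# Illustrative usage sketch (not part of the library), assuming a valid
# session_key; the returned count can then feed the multi-threaded
# get_full_kv_collection above:
#
#   total_record_count = get_collection_documents_count(
#       "https://localhost:8089",
#       session_key,
#       "kv_trackme_example_collection",  # hypothetical collection
#   )
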
def get_wlk_apps_enablement_kv_collection(collection, collection_name):
    """
    Get records from the Wlk apps enablement collection.

    :param collection: The KVstore collection object.
    :param collection_name: The name of the collection to query.

    :return: A tuple containing the records, keys, and a dictionary of the records.
    """
    collection_records = []
    collection_records_keys = set()
    collection_dict = {}

    try:
        end = False
        skip_tracker = 0
        while not end:
            # fetch in explicit chunks of 1000 to match the skip increment
            process_collection_records = collection.data.query(
                limit=1000, skip=skip_tracker
            )
            if len(process_collection_records) != 0:
                for item in process_collection_records:
                    # records are keyed by app, so deduplicate on that field
                    if item.get("app") not in collection_records_keys:
                        collection_records.append(item)
                        collection_records_keys.add(item.get("app"))
                        # add to the dict, keyed by app
                        collection_dict[item.get("app")] = item
                skip_tracker += 1000
            else:
                end = True

        return collection_records, collection_records_keys, collection_dict

    except Exception as e:
        logging.error(
            f"failed to call get_wlk_apps_enablement_kv_collection, args={collection_name}, exception={str(e)}"
        )
        raise Exception(str(e))

def get_feeds_datagen_kv_collection(collection, collection_name, component):
    """
    Get all records from the feeds data generation KVstore collection, and
    build the blocklist dictionaries (regex based and non-regex based).

    :param collection: The KVstore collection object.
    :param collection_name: The name of the collection to query.
    :param component: The TrackMe component the collection relates to (currently unused).

    :return: A tuple containing the records, keys, a dictionary of the records,
        and the non-regex and regex blocklist dictionaries.
    """
    datagen_collection_records = []
    datagen_collection_records_keys = set()
    datagen_collection_dict = {}

    datagen_collection_blocklist_not_regex_dict = {}
    datagen_collection_blocklist_regex_dict = {}

    try:
        end = False
        skip_tracker = 0
        while not end:
            # fetch in explicit chunks of 1000 to match the skip increment
            process_collection_records = collection.data.query(
                limit=1000, skip=skip_tracker
            )
            if len(process_collection_records) != 0:
                for item in process_collection_records:
                    if item.get("_key") not in datagen_collection_records_keys:
                        datagen_collection_records.append(item)
                        datagen_collection_records_keys.add(item.get("_key"))
                        datagen_collection_dict[item.get("_key")] = item

                        # blocklist
                        if item.get("action") == "block":

                            if item.get("is_rex") == "false":
                                datagen_collection_blocklist_not_regex_dict[
                                    item.get("_key")
                                ] = {
                                    "object": item.get("object"),
                                    "object_category": item.get("object_category"),
                                }

                            elif item.get("is_rex") == "true":
                                datagen_collection_blocklist_regex_dict[
                                    item.get("_key")
                                ] = {
                                    "object": item.get("object"),
                                    "object_category": item.get("object_category"),
                                }

                skip_tracker += 1000
            else:
                end = True

        return (
            datagen_collection_records,
            datagen_collection_records_keys,
            datagen_collection_dict,
            datagen_collection_blocklist_not_regex_dict,
            datagen_collection_blocklist_regex_dict,
        )

    except Exception as e:
        logging.error(
            f"failed to call get_feeds_datagen_kv_collection, args={collection_name}, exception={str(e)}"
        )
        raise Exception(str(e))

def execute_batch_find_in_chunks(collection, dbqueries, chunk_size=500):
    """
    Executes batch find operations in chunks to adhere to the query limit.

    :param collection: The collection to query.
    :param dbqueries: A list of query dictionaries.
    :param chunk_size: Maximum number of queries per batch operation.
    :return: A list of kvrecords.
    """
    kvrecords_nested = []

    # Process dbqueries in chunks
    for i in range(0, len(dbqueries), chunk_size):
        chunk = dbqueries[i : i + chunk_size]
        try:
            # Execute batch_find for the current chunk
            chunk_results = collection.data.batch_find(*chunk)
            kvrecords_nested.extend(chunk_results)
        except Exception as e:
            error_msg = f"Batch find failed for a chunk, exception={str(e)}"
            logging.error(error_msg)
            raise Exception(error_msg)

    return kvrecords_nested

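
# Worked example of the chunking above (illustrative): with 1200 queries and
# chunk_size=500, batch_find is called three times, on queries 0-499, 500-999
# and 1000-1199, and the per-chunk result lists are concatenated.
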
def batch_find_records_by_object(collection, object_list):
    """
    Batch find records matching a list of object values.

    :param collection: The KVstore collection object.
    :param object_list: The list of object values to look up.

    :return: A tuple containing a dictionary of the records keyed by _key, and
        the flat list of records.
    """
    dbqueries = [{"query": {"object": object_value}} for object_value in object_list]

    try:
        # Execute batch_find to retrieve records in chunks
        kvrecords_nested = execute_batch_find_in_chunks(collection, dbqueries)

        # Flatten the list of lists to get a single list of kvrecords
        kvrecords = list(itertools.chain.from_iterable(kvrecords_nested))

        # Create a dictionary from kvrecords, keying by '_key'
        kvrecords_dict = {kvrecord["_key"]: kvrecord for kvrecord in kvrecords}

        # Return the dictionary and the flat list of kvrecords
        return kvrecords_dict, kvrecords

    except Exception as e:
        logging.error(
            f"Failed to call batch_find_records_by_object, args={object_list}, exception={str(e)}"
        )
        raise Exception(str(e))

def batch_find_records_by_key(collection, keys_list):
    """
    Batch find records matching a list of _key values.

    :param collection: The KVstore collection object.
    :param keys_list: The list of _key values to look up.

    :return: A tuple containing a dictionary of the records keyed by _key, and
        the flat list of records.
    """
    dbqueries = [{"query": {"_key": key}} for key in keys_list]

    try:
        # Execute batch_find to retrieve records in chunks
        kvrecords_nested = execute_batch_find_in_chunks(collection, dbqueries)

        # Flatten the list of lists to get a single list of kvrecords
        kvrecords = list(itertools.chain.from_iterable(kvrecords_nested))

        # Create a dictionary from kvrecords, keying by '_key'
        kvrecords_dict = {kvrecord["_key"]: kvrecord for kvrecord in kvrecords}

        # Return the dictionary and the flat list of kvrecords
        return kvrecords_dict, kvrecords

    except Exception as e:
        logging.error(
            f"Failed to call batch_find_records_by_key, args={keys_list}, exception={str(e)}"
        )
        raise Exception(str(e))
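
# Illustrative usage sketch (not part of the library), assuming "collection"
# is a splunklib KVstore collection object:
#
#   kvrecords_dict, kvrecords = batch_find_records_by_key(
#       collection,
#       ["5f3e...", "5f3f..."],  # hypothetical _key values
#   )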