You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Splunk_Deploiement/apps/trackme/bin/trackmesamplingexecutor.py

2367 lines
107 KiB

#!/usr/bin/env python
# coding=utf-8
__author__ = "TrackMe Limited"
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
__credits__ = "TrackMe Limited, U.K."
__license__ = "TrackMe Limited, all rights reserved"
__version__ = "0.1.0"
__maintainer__ = "TrackMe Limited, U.K."
__email__ = "support@trackme-solutions.com"
__status__ = "PRODUCTION"
# Standard library imports
import os
import sys
import time
import re
import json
import hashlib
import fnmatch
# Logging imports
import logging
from logging.handlers import RotatingFileHandler
# Networking imports
import requests
import urllib3
# splunkd is called with verify=False throughout; silence the TLS warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# splunk home, required to locate the Splunk log directory
splunkhome = os.environ["SPLUNK_HOME"]
# set logging
# dedicated rotating log file, ~10 MB per file with a single backup
filehandler = RotatingFileHandler(
    "%s/var/log/splunk/trackme_sampling_executor.log" % splunkhome,
    mode="a",
    maxBytes=10000000,
    backupCount=1,
)
formatter = logging.Formatter(
    "%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s"
)
# timestamps are logged in UTC
logging.Formatter.converter = time.gmtime
filehandler.setFormatter(formatter)
log = logging.getLogger()  # root logger - Good to get it only once.
for hdlr in log.handlers[:]:  # remove the existing file handlers
    if isinstance(hdlr, logging.FileHandler):
        log.removeHandler(hdlr)
log.addHandler(filehandler)  # set the new handler
# set the log level to INFO, DEBUG as the default is ERROR
log.setLevel(logging.INFO)
# append current directory so the trackme libraries shipped alongside are importable
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# import libs
import import_declare_test
# import Splunk libs
from splunklib.searchcommands import (
dispatch,
GeneratingCommand,
Configuration,
Option,
validators,
)
# import trackme libs
from trackme_libs import (
trackme_reqinfo,
trackme_vtenant_account,
trackme_register_tenant_object_summary,
trackme_vtenant_component_info,
run_splunk_search,
trackme_handler_events,
)
# import trackme libs croniter
from trackme_libs_croniter import cron_to_seconds
# import trackme libs utils
from trackme_libs_utils import remove_leading_spaces
# import data sampling libs
from trackmedatasampling_ootb_regex import ootb_regex_list
# import TrackMe get data libs
from trackme_libs_get_data import (
get_full_kv_collection,
)
# import TrackMe feeds libs
from trackme_libs_splk_feeds import (
trackme_splk_dsm_data_sampling_gen_metrics,
trackme_splk_dsm_data_sampling_total_run_time_gen_metrics,
)
# import TrackMe decision maker libs
from trackme_libs_decisionmaker import convert_epoch_to_datetime
@Configuration(distributed=False)
class DataSamplingExecutor(GeneratingCommand):
    # The Virtual Tenant identifier (mandatory).
    tenant_id = Option(
        doc="""
        **Syntax:** **tenant_id=****
        **Description:** The tenant identifier.""",
        require=True,
        default=None,
    )

    # Executor mode; the description now lists every mode the validator accepts.
    mode = Option(
        doc="""
        **Syntax:** **mode=****
        **Description:** The data sampling executor mode, valid options are: run_sampling | test_sampling | test_model | get_samples | get_live_samples | show_kvrecord""",
        require=False,
        default="run_sampling",
        validate=validators.Match(
            "mode",
            r"^(run_sampling|test_sampling|test_model|get_samples|get_live_samples|show_kvrecord)$",
        ),
    )

    # Target entity; required by the sample retrieval modes, wildcard otherwise.
    # (help string fixed: it previously advertised the option as "mode=")
    object = Option(
        doc="""
        **Syntax:** **object=****
        **Description:** The object target, only used if mode is get_samples""",
        require=False,
        default="*",
        validate=validators.Match("object", r"^.*$"),
    )

    # Search time range quantifiers.
    earliest = Option(
        doc="""
        **Syntax:** **earliest=****
        **Description:** The earliest time quantifier.""",
        require=False,
        default="-24h",
    )
    latest = Option(
        doc="""
        **Syntax:** **latest=****
        **Description:** The latest time quantifier.""",
        require=False,
        default="now",
    )

    # Upper bound on the job runtime, in seconds.
    max_runtime = Option(
        doc="""
        **Syntax:** **max_runtime=****
        **Description:** The max runtime for the job in seconds, defaults to 15 minutes less 120 seconds of margin.""",
        require=False,
        default="900",
        validate=validators.Match("max_runtime", r"^\d*$"),
    )

    # Cap on the number of events retrieved in sample retrieval modes.
    get_samples_max_count = Option(
        doc="""
        **Syntax:** **get_samples_max_count=****
        **Description:** The max number of events to be sampled in get sample mode, default to 10k events.""",
        require=False,
        default="10000",
        validate=validators.Match("get_samples_max_count", r"^\d*$"),
    )

    # test_model inputs: the regex under test plus its metadata.
    regex_expression = Option(
        doc="""
        **Syntax:** **regex_expression=****
        **Description:** If using test_model, the regex expression and model_type should be provided.""",
        require=False,
        default=None,
        validate=validators.Match("regex_expression", r"^.*"),
    )
    model_type = Option(
        doc="""
        **Syntax:** **model_type=****
        **Description:** If using test_model, the regex expression, model_type, model_name and sourcetype_scope should be provided.""",
        require=False,
        default=None,
        validate=validators.Match("model_type", r"^(inclusive|exclusive)$"),
    )
    model_name = Option(
        doc="""
        **Syntax:** **model_name=****
        **Description:** If using test_model, the regex expression, model_type, model_name and sourcetype_scope should be provided.""",
        require=False,
        default=None,
        validate=validators.Match("model_name", r"^.*$"),
    )
    sourcetype_scope = Option(
        doc="""
        **Syntax:** **sourcetype_scope=****
        **Description:** If using test_model, the regex expression, model_type, model_name and sourcetype_scope should be provided.""",
        require=False,
        default=None,
        validate=validators.Match("sourcetype_scope", r"^.*$"),
    )
"""
Function to return the tenant metric index.
"""
def get_tenant_metric_idx(self):
    """
    Return the metric index configured for this tenant.

    Calls the TrackMe vtenants tenant_idx_settings endpoint on splunkd and
    returns the value of the trackme_metric_idx stanza.

    Raises:
        Exception: on HTTP failure or any network/parsing error, after
            logging the error message.
    """
    # Define an header for requests authenticated communications with splunkd
    header = {
        "Authorization": "Splunk %s" % self._metadata.searchinfo.session_key,
        "Content-Type": "application/json",
    }
    # get the index conf for this tenant
    url = "%s/services/trackme/v2/vtenants/tenant_idx_settings" % (
        self._metadata.searchinfo.splunkd_uri
    )
    data = {"tenant_id": self.tenant_id, "idx_stanza": "trackme_metric_idx"}
    # Retrieve and return the tenant idx; any failure is logged and raised once
    # (previously the HTTP-error branch raised inside the try, so the outer
    # except re-wrapped its own message into a duplicated, nested error)
    try:
        response = requests.post(
            url,
            headers=header,
            data=json.dumps(data, indent=1),
            verify=False,
            timeout=600,
        )
        if response.status_code not in (200, 201, 204):
            raise Exception(
                f'response.status_code="{response.status_code}", response.text="{response.text}"'
            )
        # requests already decodes the JSON payload; no dumps/loads round-trip needed
        return response.json()["trackme_metric_idx"]
    except Exception as e:
        error_msg = (
            f'failed to retrieve the tenant metric index, exception="{str(e)}"'
        )
        logging.error(error_msg)
        raise Exception(error_msg)
"""
Functions to return the entity_info for a given object_id.
"""
def get_entity_info(self, object_field, value):
    """
    Retrieve the entity info record for a DSM entity from the
    splk_dsm/ds_entity_info endpoint, looked up either by object_id or by
    object.

    Raises:
        Exception: when object_field is unsupported, or when the entity
            cannot be retrieved (the underlying error is wrapped and logged).
    """
    # guard clause: only two lookup fields are supported
    if object_field not in ("object_id", "object"):
        raise Exception(f'object_field="{object_field}" is not supported')

    # the lookup field itself becomes the payload key
    json_data = {
        "tenant_id": self.tenant_id,
        object_field: value,
    }

    try:
        target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_dsm/ds_entity_info"
        entity_info_response = requests.post(
            target_url,
            headers={
                "Authorization": f"Splunk {self._metadata.searchinfo.session_key}",
                "Content-Type": "application/json",
            },
            verify=False,
            data=json.dumps(json_data),
            timeout=600,
        )
        if entity_info_response.status_code in (200, 201, 204):
            return entity_info_response.json()
        # HTTP failure: log, then raise so the outer handler wraps it with context
        error_msg = f'failed to retrieve the entity info, data="{json.dumps(json_data, indent=2)}", response.status_code="{entity_info_response.status_code}", response.text="{entity_info_response.text}"'
        logging.error(error_msg)
        raise Exception(error_msg)
    except Exception as e:
        error_msg = f'tenant_id="{self.tenant_id}", function get_entity_info, object requested using object_field="{object_field}" with value="{value}" could not be found, exception="{str(e)}"'
        logging.error(error_msg)
        raise Exception(error_msg)
"""
Function to get sampling system settings
"""
def get_sampling_system_settings(self, reqinfo):
    """
    Extract the tenant-wide data sampling settings from the TrackMe
    configuration carried by reqinfo.

    Returns a 7-tuple, in order:
        - min time (s) between two sampling iterations per entity (int)
        - number of records sampled per entity (int)
        - number of records stored in the KVstore for inspection (int)
        - max char size of a raw sample stored in the KVstore (int)
        - min inclusive model matched percentage (float)
        - max exclusive model matched percentage (float)
        - relative time window size in seconds (int)
    """
    # all settings live under the same configuration stanza
    conf = reqinfo["trackme_conf"]["splk_data_sampling"]
    return (
        int(conf["splk_data_sampling_min_time_btw_iterations_seconds"]),
        int(conf["splk_data_sampling_no_records_per_entity"]),
        int(conf["splk_data_sampling_no_records_saved_kvrecord"]),
        int(conf["splk_data_sampling_records_kvrecord_truncate_size"]),
        float(conf["splk_data_sampling_pct_min_major_inclusive_model_match"]),
        float(conf["splk_data_sampling_pct_max_exclusive_model_match"]),
        int(conf["splk_data_sampling_relative_time_window_seconds"]),
    )
"""
Function to get sampling entity settings
"""
def get_sampling_entity_settings(
    self,
    kvrecord,
    splk_data_sampling_pct_min_major_inclusive_model_match,
    splk_data_sampling_pct_max_exclusive_model_match,
    splk_data_sampling_min_time_btw_iterations_seconds,
    splk_data_sampling_no_records_per_entity,
    splk_data_sampling_relative_time_window_seconds,
):
    """
    Resolve the per-entity sampling settings from the entity KVstore record,
    falling back to the system-wide value when the entity-level override is
    missing or cannot be cast to the expected type.

    Returns a 5-tuple, in order:
        (pct_min_major_inclusive_model_match, pct_max_exclusive_model_match,
         min_time_btw_iterations_seconds, max_events_per_sampling_iteration,
         relative_time_window_seconds)
    """

    def _resolve(field, fallback, cast):
        # cast the record value (or the fallback when the field is absent);
        # any casting failure means the override is unusable -> use fallback
        try:
            return cast(kvrecord.get(field, fallback))
        except Exception:
            return fallback

    return (
        _resolve(
            "pct_min_major_inclusive_model_match",
            splk_data_sampling_pct_min_major_inclusive_model_match,
            float,
        ),
        _resolve(
            "pct_max_exclusive_model_match",
            splk_data_sampling_pct_max_exclusive_model_match,
            float,
        ),
        _resolve(
            "min_time_btw_iterations_seconds",
            splk_data_sampling_min_time_btw_iterations_seconds,
            int,
        ),
        _resolve(
            "max_events_per_sampling_iteration",
            splk_data_sampling_no_records_per_entity,
            int,
        ),
        _resolve(
            "relative_time_window_seconds",
            splk_data_sampling_relative_time_window_seconds,
            int,
        ),
    )
"""
Function to get the upstream search definition
"""
def get_upstream_search_definition(
    self, splk_data_sampling_relative_time_window_seconds
):
    """
    Build the SPL search that produces the ordered list of entities to be
    sampled, one result row per entity.

    When self.object is a specific entity, only that entity is returned;
    otherwise all enabled entities are selected, filtered and priority-ranked.
    Each row carries earliest_target/latest_target computed from the entity's
    data_last_time_seen and its relative time window (entity-level override
    when numeric, otherwise the system-wide value passed as argument).
    Returns the SPL string.
    """
    if self.object != "*":
        # single-entity scope: no eligibility filtering, just enrich and
        # compute the sampling time window
        upstream_search_string = remove_leading_spaces(
            f"""\
            | inputlookup trackme_dsm_tenant_{self.tenant_id} where object="{self.object}"
            | eval key=_key
            | lookup trackme_dsm_data_sampling_tenant_{self.tenant_id} object OUTPUT data_sample_feature, relative_time_window_seconds, data_sample_last_entity_epoch_processed
            | fields object, key, data_last_time_seen, *
            | eval earliest_target=if(isnum(relative_time_window_seconds), data_last_time_seen-relative_time_window_seconds, data_last_time_seen-{splk_data_sampling_relative_time_window_seconds})
            | eval latest_target=if(isnum(relative_time_window_seconds), earliest_target+relative_time_window_seconds, earliest_target+{splk_data_sampling_relative_time_window_seconds})
            """
        )
    else:
        # full scope: enabled entities with recent data only, excluding those
        # already processed or sampled too recently, ordered by priority
        upstream_search_string = remove_leading_spaces(
            f"""\
            | inputlookup trackme_dsm_tenant_{self.tenant_id} where monitored_state="enabled"
            | eval key=_key
            | `trackme_exclude_badentities`
            | where data_last_time_seen>relative_time(now(), "-24h")
            | lookup trackme_dsm_data_sampling_tenant_{self.tenant_id} object OUTPUT data_sample_feature, relative_time_window_seconds, data_sample_last_entity_epoch_processed, min_time_btw_iterations_seconds, data_sample_mtime
            ``` only consider entities where the last processed epoch (data_sample_last_entity_epoch_processed) is older than data_last_time_seen, or null (entities has not been processed yet) ```
            | where (isnull(data_sample_last_entity_epoch_processed) OR data_sample_last_entity_epoch_processed<data_last_time_seen)
            | eval data_sample_feature=if(isnull(data_sample_feature), "enabled", data_sample_feature) | where (data_sample_feature!="disabled")
            ``` only consider entities where the min_time_btw_iterations_seconds is older than the current time (bigger or equal to the time spent since last run, or null for new entities) ```
            | eval time_spent_since_last_run=now()-data_sample_mtime
            | where (isnull(min_time_btw_iterations_seconds) OR time_spent_since_last_run>=min_time_btw_iterations_seconds)
            ``` define a priority rank, entities that have been set as disabled_auto should be processed last compared to entities in disabled_audo ```
            | eval priority_rank=if(data_sample_feature=="enabled", 1, 2)
            ``` order ```
            | sort limit=0 priority_rank, data_sample_mtime
            | fields object, key, data_last_time_seen, *
            | eval earliest_target=if(isnum(relative_time_window_seconds), data_last_time_seen-relative_time_window_seconds, data_last_time_seen-{splk_data_sampling_relative_time_window_seconds})
            | eval latest_target=if(isnum(relative_time_window_seconds), earliest_target+relative_time_window_seconds, earliest_target+{splk_data_sampling_relative_time_window_seconds})
            """
        )
    logging.debug(f'upstream_search_string="{upstream_search_string}"')
    return upstream_search_string
"""
Function to return the models for test
"""
def get_test_models(self):
    """
    Build the model lists for mode=test_model from the command options
    (model_name, regex_expression, model_type, sourcetype_scope).

    Returns (merged_models_inclusive, merged_models_exclusive): the single
    model under test is placed in the list matching its model_type, the
    other list stays empty.
    """
    merged_models_inclusive = []
    merged_models_exclusive = []
    if self.model_type in ("inclusive", "exclusive"):
        # assemble the model entry once, then dispatch it to the right list
        test_model = {
            "model_name": self.model_name,
            "model_regex": self.regex_expression,
            "model_type": self.model_type,
            # model_id is a stable hash of the model name
            "model_id": hashlib.sha256(
                self.model_name.encode("utf-8")
            ).hexdigest(),
            "sourcetype_scope": self.sourcetype_scope,
        }
        if self.model_type == "inclusive":
            merged_models_inclusive.append(test_model)
        else:
            merged_models_exclusive.append(test_model)
    return merged_models_inclusive, merged_models_exclusive
"""
Function to return the models for run
"""
def get_run_models(self, custom_models_records):
    """
    Build the model lists for a real sampling run.

    Custom models coming from the tenant KVstore are dispatched to the
    inclusive or exclusive list according to their model_type, then every
    out-of-the-box regex model is appended to the inclusive list.
    Returns (merged_models_inclusive, merged_models_exclusive).
    """
    merged_models_inclusive = []
    merged_models_exclusive = []

    # dispatch table: model_type -> destination list (unknown types dropped)
    destinations = {
        "inclusive": merged_models_inclusive,
        "exclusive": merged_models_exclusive,
    }
    for custom_model in custom_models_records:
        destination = destinations.get(custom_model.get("model_type"))
        if destination is not None:
            destination.append(
                {
                    "model_name": custom_model.get("model_name"),
                    "model_regex": custom_model.get("model_regex"),
                    "model_type": custom_model.get("model_type"),
                    "model_id": custom_model.get("model_id"),
                    "sourcetype_scope": custom_model.get("sourcetype_scope"),
                }
            )

    # OOTB models are always inclusive, scoped to every sourcetype
    for ootb_model in ootb_regex_list:
        label = ootb_model.get("label")
        merged_models_inclusive.append(
            {
                "model_name": label,
                "model_regex": ootb_model.get("regex"),
                "model_type": "inclusive",
                "model_id": hashlib.sha256(label.encode("utf-8")).hexdigest(),
                "sourcetype_scope": "*",
            }
        )
    return merged_models_inclusive, merged_models_exclusive
"""
Function to call the disable sampling endpoint
"""
def disable_sampling(self, object_key, object_value, reason):
    """
    Call the ds_manage_data_sampling write endpoint to disable the data
    sampling feature for one entity, recording the reason as the update
    comment.

    Returns True when the endpoint accepted the request, False otherwise;
    never raises (failures are logged).
    """
    json_data = {
        "tenant_id": self.tenant_id,
        "keys_list": object_key,
        "action": "disable",
        "update_comment": reason,
    }
    target_url = f"{self._metadata.searchinfo.splunkd_uri}/services/trackme/v2/splk_dsm/write/ds_manage_data_sampling"
    try:
        response = requests.post(
            target_url,
            headers={
                "Authorization": f"Splunk {self._metadata.searchinfo.session_key}",
                "Content-Type": "application/json",
            },
            verify=False,
            data=json.dumps(json_data),
            timeout=600,
        )
    except Exception as e:
        logging.error(
            f'tenant_id="{self.tenant_id}", object="{object_value}", object_id="{object_key}", could not disable data sampling, exception="{str(e)}"'
        )
        return False
    if response.status_code in (200, 201, 204):
        logging.info(
            f'tenant_id="{self.tenant_id}", object="{object_value}", object_id="{object_key}", auto-disablement of sampling was successful, response="{response.text}"'
        )
        return True
    logging.error(
        f'tenant_id="{self.tenant_id}", object="{object_value}", object_id="{object_key}", could not disable data sampling, response.status_code="{response.status_code}", response="{response.text}"'
    )
    return False
"""
Function to init entity metadata
"""
def init_entity_metadata(self, kvrecord):
    """
    Prepare the working metadata for a new sampling iteration of an entity.

    Fresh current_* accumulators start empty; the previous_* values are
    loaded from the record's current_* keys, which were written by the
    previous iteration. Returns a 17-tuple in the order:
    (current_detected_format, current_detected_format_dcount,
     current_detected_format_id, current_detected_major_format,
     previous_detected_format, previous_detected_format_dcount,
     previous_detected_format_id, previous_detected_major_format,
     data_sample_anomaly_detected, data_sample_anomaly_reason,
     data_sample_feature, data_sample_iteration, data_sample_mtime,
     data_sample_status_colour, data_sample_status_message,
     multiformat_detected, exclusive_match_anomaly)
    """
    # fresh accumulators for the iteration about to run
    current_detected_format = []
    current_detected_format_dcount = 0
    current_detected_format_id = []
    current_detected_major_format = None

    # previous key information are stored as current_<key> in the record
    previous_detected_format = kvrecord.get("current_detected_format", [])
    previous_detected_format_dcount = kvrecord.get(
        "current_detected_format_dcount", 0
    )
    previous_detected_format_id = kvrecord.get("current_detected_format_id", [])
    previous_detected_major_format = kvrecord.get(
        "current_detected_major_format", None
    )

    # anomaly state and feature flag carried over from the record
    data_sample_anomaly_detected = kvrecord.get(
        "data_sample_anomaly_detected", False
    )
    data_sample_anomaly_reason = kvrecord.get("data_sample_anomaly_reason", "N/A")
    data_sample_feature = kvrecord.get("data_sample_feature", "enabled")

    # iteration counter: starts at 1 on the first run, incremented afterwards
    raw_iteration = kvrecord.get("data_sample_iteration", None)
    data_sample_iteration = int(raw_iteration) + 1 if raw_iteration else 1

    # last modification time, defaulting to now for new records
    data_sample_mtime = kvrecord.get("data_sample_mtime", time.time())

    # per-iteration status placeholders
    data_sample_status_colour = None
    data_sample_status_message = {}
    multiformat_detected = False
    exclusive_match_anomaly = False

    return (
        current_detected_format,
        current_detected_format_dcount,
        current_detected_format_id,
        current_detected_major_format,
        previous_detected_format,
        previous_detected_format_dcount,
        previous_detected_format_id,
        previous_detected_major_format,
        data_sample_anomaly_detected,
        data_sample_anomaly_reason,
        data_sample_feature,
        data_sample_iteration,
        data_sample_mtime,
        data_sample_status_colour,
        data_sample_status_message,
        multiformat_detected,
        exclusive_match_anomaly,
    )
"""
Function to return entity_search_string
"""
def get_entity_search_string(
    self,
    entity_info,
    object_value,
    object_key,
    splk_dsm_sampling_search,
    splk_data_sampling_no_records_per_entity,
):
    """
    Build the final sampling search for one entity.

    The entity's base sampling search (splk_dsm_sampling_search) is capped to
    a number of records that depends on the mode, then enriched with the
    entity key/object and normalised field names. Only meaningful for modes
    run_sampling, test_sampling and test_model — other modes never call this.

    Returns the assembled search string.
    Raises ValueError for an unsupported mode (previously this surfaced as
    an UnboundLocalError).
    """
    # the record cap is the only difference between the modes; the rest of
    # the assembly logic was duplicated verbatim and is now unified
    if self.mode in ("run_sampling", "test_sampling"):
        head_count = splk_data_sampling_no_records_per_entity
    elif self.mode == "test_model":
        head_count = self.get_samples_max_count
    else:
        raise ValueError(
            f'mode="{self.mode}" is not supported by get_entity_search_string'
        )

    # non-local (remote account) sampling searches already embed a
    # "head 1000" clause which we override; local searches get it appended
    if entity_info.get("account") != "local":
        search_string = splk_dsm_sampling_search.replace(
            "head 1000",
            f"head {head_count}",
        )
    else:
        search_string = f"{splk_dsm_sampling_search} | head {head_count}"

    # add the key/object and normalise the output fields
    search_string = remove_leading_spaces(
        f"""\
        {search_string}
        | eval key="{str(object_key)}", object="{str(object_value)}"
        | rename _raw as raw_sample, sourcetype as data_sourcetype
        | table key, object, data_sourcetype, raw_sample
        """
    )
    logging.debug(f'splk_dsm_sampling_search="{splk_dsm_sampling_search}"')
    return search_string
"""
Function to return the entity search kwargs
"""
def get_entity_search_kwargs(
    self, object_value, object_key, search_string, earliest_target, latest_target
):
    """
    Build the kwargs for the per-entity sampling search and log the search
    about to be executed.

    In run_sampling/test_sampling the per-entity earliest_target/latest_target
    boundaries are used; in test_model the command's earliest/latest options
    are used instead. Returns the kwargs dict.
    """
    # in mode run_sampling and test_sampling, we use the earliest_target
    if self.mode in ("run_sampling", "test_sampling"):
        kwargs_samplesearch = {
            "earliest_time": earliest_target,
            "latest_time": latest_target,
            "count": 0,
            "output_mode": "json",
        }
        logging.info(
            f'tenant_id="{self.tenant_id}", object="{object_value}", object_id="{object_key}", Executing data sampling resulting search="{search_string}", earliest="{earliest_target}", latest="{latest_target}"'
        )
    # in mode test_model, we use the earliest and latest
    elif self.mode == "test_model":
        kwargs_samplesearch = {
            "earliest_time": self.earliest,
            "latest_time": self.latest,
            "count": 0,
            "output_mode": "json",
        }
        # fix: log the boundaries actually used in this mode (self.earliest /
        # self.latest); the previous message reported earliest_target and
        # latest_target, which this branch does not use
        logging.info(
            f'tenant_id="{self.tenant_id}", object="{object_value}", object_id="{object_key}", Executing data sampling resulting search="{search_string}", earliest="{self.earliest}", latest="{self.latest}"'
        )
    return kwargs_samplesearch
"""
Function to retrieve the sampling kvrecord
"""
def get_sampling_kvrecord(self, collection, object_field, object_value):
    """
    Look up the data sampling KVstore record where object_field equals
    object_value.

    Returns (kvrecord, key); ({}, None) when no record exists yet or the
    query fails (best-effort, never raises).
    """
    query_string = {"$and": [{object_field: object_value}]}
    try:
        # first match wins; IndexError on an empty result set falls through
        kvrecord = collection.data.query(query=json.dumps(query_string))[0]
        return kvrecord, kvrecord.get("_key")
    except Exception:
        return {}, None
def generate(self, **kwargs):
# performance counter
start = time.time()
# Track execution times
average_execution_time = 0
# Get request info and set logging level
reqinfo = trackme_reqinfo(
self._metadata.searchinfo.session_key,
self._metadata.searchinfo.splunkd_uri,
)
log.setLevel(reqinfo["logging_level"])
# Get Virtual Tenant account
vtenant_account = trackme_vtenant_account(
self._metadata.searchinfo.session_key,
self._metadata.searchinfo.splunkd_uri,
self.tenant_id,
)
# if mode is test_model, regex_expression should be provided
if self.mode == "test_model":
if (
not self.regex_expression
or not self.model_type
or not self.model_name
or not self.sourcetype_scope
):
raise Exception(
f'if mode is test_model, the regex expression, model_type, model_name and sourcetype_scope, mode="{self.mode}", regex_expression="{self.regex_expression}", model_type="{self.model_type}", model_name="{self.model_name}", sourcetype_scope="{self.sourcetype_scope}"'
)
# get metric index
metric_index = self.get_tenant_metric_idx()
# Retrieve custom models, if any.
custom_models_collection_name = (
f"kv_trackme_dsm_data_sampling_custom_models_tenant_{self.tenant_id}"
)
custom_models_collection = self.service.kvstore[custom_models_collection_name]
(
custom_models_records,
custom_models_collection_keys,
custom_models_collection_dict,
) = get_full_kv_collection(
custom_models_collection, custom_models_collection_name
)
logging.debug(
f'custom_models_records="{json.dumps(custom_models_records, indent=2)}"'
)
#
# Step: merge the custom models with the OOTB models
#
merged_models_inclusive = []
merged_models_exclusive = []
if self.mode == "test_model":
merged_models_inclusive, merged_models_exclusive = self.get_test_models()
else:
merged_models_inclusive, merged_models_exclusive = self.get_run_models(
custom_models_records
)
logging.debug(
f'merged_models_inclusive="{json.dumps(merged_models_inclusive, indent=2)}"'
)
logging.debug(
f'merged_models_exclusive="{json.dumps(merged_models_exclusive, indent=2)}"'
)
# max runtime
max_runtime = int(self.max_runtime)
# Retrieve the search cron schedule
savedsearch_name = f"trackme_dsm_data_sampling_tracker_tenant_{self.tenant_id}"
savedsearch = self.service.saved_searches[savedsearch_name]
savedsearch_cron_schedule = savedsearch.content["cron_schedule"]
# get the cron_exec_sequence_sec
try:
cron_exec_sequence_sec = int(cron_to_seconds(savedsearch_cron_schedule))
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", component="splk-dsm", failed to convert the cron schedule to seconds, error="{str(e)}"'
)
cron_exec_sequence_sec = max_runtime
# the max_runtime cannot be bigger than the cron_exec_sequence_sec
if max_runtime > cron_exec_sequence_sec:
max_runtime = cron_exec_sequence_sec
logging.info(
f'tenant_id={self.tenant_id}, max_runtime="{max_runtime}", savedsearch_name="{savedsearch_name}", savedsearch_cron_schedule="{savedsearch_cron_schedule}", cron_exec_sequence_sec="{cron_exec_sequence_sec}"'
)
#
# system wide settings for data sampling
#
(
splk_data_sampling_min_time_btw_iterations_seconds,
splk_data_sampling_no_records_per_entity,
splk_data_sampling_no_records_saved_kvrecord,
splk_data_sampling_records_kvrecord_truncate_size,
splk_data_sampling_pct_min_major_inclusive_model_match,
splk_data_sampling_pct_max_exclusive_model_match,
splk_data_sampling_relative_time_window_seconds,
) = self.get_sampling_system_settings(reqinfo)
# init
upstream_search_string = None
# counter
count = 0
# Get the session key
session_key = self._metadata.searchinfo.session_key
# Data collection
collection_name = f"kv_trackme_dsm_data_sampling_tenant_{self.tenant_id}"
collection = self.service.kvstore[collection_name]
# get the upstream search definition
upstream_search_string = self.get_upstream_search_definition(
splk_data_sampling_relative_time_window_seconds
)
# Set kwargs
kwargs_upstream_search = {
"earliest_time": self.earliest,
"latest_time": self.latest,
"count": 0,
"output_mode": "json",
}
logging.info(
f'tenant_id={self.tenant_id}, Executing upstream definition search to define the list of entities to be sampled by order of priority, search="{upstream_search_string}"'
)
# get vtenant component info
vtenant_component_info = trackme_vtenant_component_info(
session_key,
self._metadata.searchinfo.splunkd_uri,
self.tenant_id,
)
logging.debug(
f'vtenant_component_info="{json.dumps(vtenant_component_info, indent=2)}"'
)
# get sampling, if set 0 then sampling is disabled for the tenant, 1 we can proceed
sampling_feature_enabled = True
try:
if int(vtenant_account.get("sampling")) == 0:
sampling_feature_enabled = False
except Exception as e:
sampling_feature_enabled = True
# check schema version migration state
try:
schema_version = int(vtenant_component_info["schema_version"])
schema_version_upgrade_in_progress = bool(
int(vtenant_component_info["schema_version_upgrade_in_progress"])
)
logging.debug(
f'schema_version_upgrade_in_progress="{schema_version_upgrade_in_progress}"'
)
except Exception as e:
schema_version = 0
schema_version_upgrade_in_progress = False
logging.error(
f'failed to retrieve schema_version_upgrade_in_progress=, exception="{str(e)}"'
)
# Do not proceed if the schema version upgrade is in progress
if schema_version_upgrade_in_progress:
yield_json = {
"_time": time.time(),
"tenant_id": self.tenant_id,
"response": f'tenant_id="{self.tenant_id}", schema upgrade is currently in progress, we will wait until the process is completed before proceeding, the schema upgrade is handled by the health_tracker of the tenant and is completed once the schema_version field of the Virtual Tenants KVstore (trackme_virtual_tenants) matches TrackMe\'s version, schema_version="{schema_version}", schema_version_upgrade_in_progress="{schema_version_upgrade_in_progress}"',
"schema_version": schema_version,
"schema_version_upgrade_in_progress": schema_version_upgrade_in_progress,
}
logging.info(
f"tenant_id={self.tenant_id}, {json.dumps(yield_json, indent=2)}"
)
yield {
"_time": yield_json["_time"],
"_raw": yield_json,
}
# Do not proceed if the sampling feature is disabled
if not sampling_feature_enabled:
yield_json = {
"_time": time.time(),
"tenant_id": self.tenant_id,
"response": f'tenant_id="{self.tenant_id}", data sampling feature is disabled for this tenant, sampling="{sampling_feature_enabled}"',
"sampling_feature_enabled": sampling_feature_enabled,
}
logging.info(
f"tenant_id={self.tenant_id}, {json.dumps(yield_json, indent=2)}"
)
yield {
"_time": yield_json["_time"],
"_raw": yield_json,
}
# available modes
# - run_sampling: run the full sampling process, as expected per schedule
# - test_model: test a model against a sample event
# - test_sampling: same as run_sampling but dot not update the KVstore, used for testing purposes
# - get_samples: get samples for simulation or inline search purposes (from KVstore)
# - get_live_samples: get samples for simulation or inline search purposes (from live data)
if self.mode in ("get_samples", "get_live_samples", "show_kvrecord"):
# object is required
if not self.object or self.object == "*":
raise Exception(f"object is required in mode={self.mode}")
# get the kvrecord and key
kvrecord, key = self.get_sampling_kvrecord(
collection, "object", self.object
)
# run the main report, every result is a Splunk search to be executed on its own thread
if not key:
raise Exception(
"this entity was not found in the collection or data sampling has not been executed yet for this entity."
)
#
# run
#
if self.mode in ("get_live_samples"):
# get the entity info
try:
entity_info = self.get_entity_info("object", self.object)
except Exception as e:
raise Exception(
f'function get_entity_info, called with arguments: object_field="object", object_value="{self.object}", could not retrieve entity info data sampling search, this entity was not found, exception="{str(e)}"'
)
#
# from entity_info, get splk_dsm_sampling_search and inspect the type of entity
#
splk_dsm_sampling_search = entity_info.get(
"splk_dsm_sampling_search", None
)
# run the main report, every result is a Splunk search to be executed on its own thread
if not splk_dsm_sampling_search:
raise Exception(
"could not retrieve entity info data sampling search, this entity was not found"
)
else:
# replace the number of records to be sampled
if entity_info.get("account") == "local":
live_sample_search_string = f"{splk_dsm_sampling_search} | head {self.get_samples_max_count}"
else:
live_sample_search_string = splk_dsm_sampling_search.replace(
"head 1000",
f"head {self.get_samples_max_count}",
)
# add the key
live_sample_search_string = remove_leading_spaces(
f"""\
{live_sample_search_string}
| eval key="{str(key)}", object="{str(self.object)}"
| rename _raw as raw_sample, sourcetype as data_sourcetype
"""
)
# Set kwargs
kwargs_live_sample_search = {
"earliest_time": self.earliest,
"latest_time": self.latest,
"count": 0,
"output_mode": "json",
}
try:
subreader = run_splunk_search(
self.service,
live_sample_search_string,
kwargs_live_sample_search,
24,
5,
)
for item in subreader:
if isinstance(item, dict):
logging.debug(f'search_results="{item}"')
raw_sample = item.get("raw_sample")
if raw_sample:
raw_sample = raw_sample.rstrip(
"\n"
) # Removes the newline only if it's at the end
item["raw_sample"] = raw_sample
data_sourcetype = item.get("data_sourcetype")
# if data_sourcetype is a list, take the first element
if isinstance(data_sourcetype, list):
data_sourcetype = data_sourcetype[0]
data = {
"_time": time.time(),
"key": item.get("key"),
"object": item.get("object"),
"_raw": raw_sample,
"data_sourcetype": data_sourcetype,
}
yield data
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}" search failed with exception="{str(e)}"'
)
elif self.mode == "show_kvrecord":
# yield the kvrecord
yield_record = {}
yield_record["_time"] = time.time()
for k, v in kvrecord.items():
yield_record[k] = v
yield_record["_raw"] = json.dumps(kvrecord)
yield yield_record
elif self.mode == "get_samples":
# get the raw_sample_list
raw_sample_list = kvrecord.get("raw_sample")
# loop through the raw_sample_list
for record in raw_sample_list:
# load as an object
record = json.loads(record)
# yield the kvrecord
yield_record = {}
yield_record["_time"] = time.time()
for k, v in record.items():
yield_record[k] = v
yield_record["_raw"] = json.dumps(record)
yield yield_record
elif (
self.mode in ("run_sampling", "test_sampling", "test_model")
and not schema_version_upgrade_in_progress
and sampling_feature_enabled
):
# report name for logging purposes
report_name = f"trackme_dsm_data_sampling_tracker_tenant_{self.tenant_id}"
# run the main report, every result is a Splunk search to be executed on its own thread
objects_list = []
# From the vtenant account, get the value of Sampling obfuscation
data_sampling_obfuscation = vtenant_component_info.get(
"data_sampling_obfuscation"
)
#
# run the upstream search
#
try:
reader = run_splunk_search(
self.service,
upstream_search_string,
kwargs_upstream_search,
24,
5,
)
for item in reader:
if isinstance(item, dict):
logging.debug(f'search_results="{item}"')
# append to the list of searches
objects_list.append(
{
"object": item.get("object"),
"key": item.get("key"),
"earliest_target": item.get("earliest_target"),
"latest_target": item.get("latest_target"),
"data_last_time_seen": item.get("data_last_time_seen"),
}
)
except Exception as e:
if self.mode == "run_sampling":
# Call the component register
trackme_register_tenant_object_summary(
session_key,
self._metadata.searchinfo.splunkd_uri,
self.tenant_id,
"splk-dsm",
f"trackme_dsm_data_sampling_tracker_tenant_{self.tenant_id}",
"failure",
time.time(),
str(time.time() - start),
str(e),
"-24h",
"now",
)
msg = f'tenant_id="{self.tenant_id}", component="splk-dsm", search failed with exception="{str(e)}"'
logging.error(msg)
raise Exception(
msg
) # if failed, the jobs exists and is tagged as failed in the component register
# loop
logging.debug(f'objects_list="{json.dumps(objects_list, indent=2)}"')
# Initialize sum of execution times and count of iterations
total_execution_time = 0
iteration_count = 0
# Other initializations
max_runtime = int(self.max_runtime)
entities_count = 0
#
# Loop through entities to be processed
#
for object_dict in objects_list:
entities_count += 1
object_value = object_dict.get("object")
object_key = object_dict.get("key")
earliest_target = object_dict.get("earliest_target")
latest_target = object_dict.get("latest_target")
data_last_time_seen = object_dict.get("data_last_time_seen")
logging.info(
f'tenant_id="{self.tenant_id}", processing entity object="{object_value}", object_id="{object_key}"'
)
# iteration start
iteration_start_time = time.time()
# get the kvrecord and key
kvrecord, key = self.get_sampling_kvrecord(
collection, "_key", object_key
)
#
# is_eligible boolean, is_eligible_reason string
#
is_eligible = True
is_eligible_reason = "N/A"
#
# entity info
#
# get the entity info
try:
entity_info = self.get_entity_info("object_id", object_key)
except Exception as e:
entity_info = {}
#
# from entity_info, get splk_dsm_sampling_search and inspect the type of entity
#
splk_dsm_sampling_search = entity_info.get(
"splk_dsm_sampling_search", None
)
if splk_dsm_sampling_search: # handle if N/A
if splk_dsm_sampling_search == "N/A":
splk_dsm_sampling_search = None
is_elastic = int(entity_info.get("is_elastic", 0))
search_mode = entity_info.get("search_mode", "unknown")
logging.debug(
f'tenant_id="{self.tenant_id}", object="{object_value}", object_id="{object_key}", splk_dsm_sampling_search="{splk_dsm_sampling_search}", is_elastic="{is_elastic}", search_mode="{search_mode}"'
)
# inspect the entity type
if is_elastic == 1 and search_mode in (
"mstats",
"mpreview",
"from",
):
# disable sampling for non eligible elastic search entities
is_eligible = False
is_eligible_reason = "elastic_search_entity"
logging.info(
f'tenant_id="{self.tenant_id}", object="{object_value}", object_id="{object_key}", is_eligible="{is_eligible}", is_eligible_reason="{is_eligible_reason}", processing with auto-disablement of sampling'
)
self.disable_sampling(
object_key,
object_value,
"auto-disablement of sampling for elastic search entities",
)
elif not splk_dsm_sampling_search or splk_dsm_sampling_search == "N/A":
# disable sampling for entities returning non available sampling search
is_eligible = False
is_eligible_reason = "no_sampling_search"
logging.info(
f'tenant_id="{self.tenant_id}", object="{object_value}", object_id="{object_key}", is_eligible="{is_eligible}", is_eligible_reason="{is_eligible_reason}", processing with auto-disablement of sampling'
)
self.disable_sampling(
object_key,
object_value,
"auto-disablement of sampling for entities without sampling search identified",
)
#
# process entity sampling
#
if not is_eligible:
continue # stop processing this entity
# get the entity settings
(
pct_min_major_inclusive_model_match,
pct_max_exclusive_model_match,
min_time_btw_iterations_seconds,
max_events_per_sampling_iteration,
relative_time_window_seconds,
) = self.get_sampling_entity_settings(
kvrecord,
splk_data_sampling_pct_min_major_inclusive_model_match,
splk_data_sampling_pct_max_exclusive_model_match,
splk_data_sampling_min_time_btw_iterations_seconds,
splk_data_sampling_no_records_per_entity,
splk_data_sampling_relative_time_window_seconds,
)
# call init function
(
current_detected_format,
current_detected_format_dcount,
current_detected_format_id,
current_detected_major_format,
previous_detected_format,
previous_detected_format_dcount,
previous_detected_format_id,
previous_detected_major_format,
data_sample_anomaly_detected,
data_sample_anomaly_reason,
data_sample_feature,
data_sample_iteration,
data_sample_mtime,
data_sample_status_colour,
data_sample_status_message,
multiformat_detected,
exclusive_match_anomaly,
) = self.init_entity_metadata(kvrecord)
# call get_entity_search_string
search_string = self.get_entity_search_string(
entity_info,
object_value,
object_key,
splk_dsm_sampling_search,
splk_data_sampling_no_records_per_entity,
)
# a list to store the results
sample_data_list = []
sample_events_list = []
# run search
try:
# start
entity_search_start = time.time()
# get kwargs
kwargs_samplesearch = self.get_entity_search_kwargs(
object_value,
object_key,
search_string,
earliest_target,
latest_target,
)
reader = run_splunk_search(
self.service,
search_string,
kwargs_samplesearch,
24,
5,
)
count += 1
for item in reader:
if isinstance(item, dict):
logging.debug(
f'search_results="{json.dumps(item, indent=2)}"'
)
raw_sample = item.get("raw_sample")
if raw_sample:
raw_sample = raw_sample.rstrip(
"\n"
) # Removes the newline only if it's at the end
item["raw_sample"] = raw_sample
data_sourcetype = item.get("data_sourcetype")
# if data_sourcetype is a list, take the first element
if isinstance(data_sourcetype, list):
data_sourcetype = data_sourcetype[0]
data = {
"_time": time.time(),
"key": item.get("key"),
"object": item.get("object"),
"raw_sample": raw_sample,
"data_sourcetype": data_sourcetype,
}
# add to the list
sample_data_list.append(data)
logging.info(
f'tenant_id="{self.tenant_id}" search successfully executed in {round(time.time() - entity_search_start, 3)} seconds'
)
except Exception as e:
# Call the component register
msg = f'tenant_id="{self.tenant_id}" search failed with exception="{str(e)}"'
logging.error(msg)
continue # stop processing this entity
#
# Investigate results for this entity
#
# events_count
events_count = len(sample_data_list)
# model_split_dict
model_split_dict = {}
for record in sample_data_list:
yield_record = {}
raw_sample = record.get("raw_sample")
raw_sample_id = hashlib.sha256(
raw_sample.encode("utf-8")
).hexdigest()
data_sourcetype = record.get("data_sourcetype")
# model_match boolean
model_match = False
# result_sampling_json_list
result_sampling_json_list = []
# loop through custom models, if any
#
# inclusive models
#
for model in merged_models_inclusive:
# extract
model_name = model.get("model_name")
model_regex = model.get("model_regex")
model_type = model.get("model_type")
model_id = model.get("model_id")
sourcetype_scope = model.get("sourcetype_scope")
sourcetype_scope = sourcetype_scope.split(
","
) # support comma separated sourcetypes
# init model counters
model_count_matched = 0
logging.debug(
f'testing inclusive_model: model_name="{model_name}", model_type="{model_type}", model_id="{model_id}", sourcetype_scope="{sourcetype_scope}"'
)
if any(
fnmatch.fnmatch(data_sourcetype, sourcetype.strip())
for sourcetype in sourcetype_scope
):
logging.debug(
f"testing regex: {model_regex} against event_id: {raw_sample_id}, event: {raw_sample}"
)
if re.search(model_regex, raw_sample):
model_match = True
model_count_matched += 1
logging.debug(
f'raw_sample_id="{raw_sample_id}", model_name="{model_name}", model_type="{model_type}", model_id="{model_id}", sourcetype_scope="{sourcetype_scope}" has a positive match with the sample event'
)
# add the model_name to current_detected_format, if not already in the list
if model_name not in current_detected_format:
current_detected_format.append(model_name)
# add the model_id to current_detected_format_id, if not already in the list
if model_id not in current_detected_format_id:
current_detected_format_id.append(model_id)
# check if the model is inclusive or exclusive
if model_type == "exclusive":
exclusive_match_anomaly = True
result_sampling = {
"raw_sample_id": raw_sample_id,
"model_match": model_match,
"model_name": model_name,
"model_type": model_type,
"model_id": model_id,
"sourcetype_scope": sourcetype_scope,
"exclusive_match_anomaly": exclusive_match_anomaly,
"message": "positive match found for event",
}
# if mode is test_sampling, add the model_regex to the result_sampling
if self.mode == "test_sampling":
result_sampling["model_regex"] = model_regex
result_sampling_json_list.append(result_sampling)
# if model has a positive match:
# - if not already in the model_split_dict, add it and add the model_count_matched as well as model_name and model_id
# - if already in the model_split_dict, increment the model_count_matched
if model_match:
if model_id not in model_split_dict:
model_split_dict[model_id] = {
"model_count_matched": model_count_matched,
"model_name": model_name,
"model_type": model_type,
}
else:
model_split_dict[model_id][
"model_count_matched"
] += model_count_matched
# break at first positive match for this event
break
else:
logging.debug(
f'model_name="{model_name}", model_type="{model_type}", model_id="{model_id}", sourcetype_scope="{sourcetype_scope}" no match found for event'
)
#
# exclusive models
#
for model in merged_models_exclusive:
# extract
model_name = model.get("model_name")
model_regex = model.get("model_regex")
model_type = model.get("model_type")
model_id = model.get("model_id")
sourcetype_scope = model.get("sourcetype_scope")
sourcetype_scope = sourcetype_scope.split(
","
) # support comma separated sourcetypes
logging.debug(
f'testing exclusive_model: model_name="{model_name}", model_type="{model_type}", model_id="{model_id}", sourcetype_scope="{sourcetype_scope}", model_regex="{model_regex}", raw_sample="{raw_sample}"'
)
if any(
fnmatch.fnmatch(data_sourcetype, sourcetype.strip())
for sourcetype in sourcetype_scope
):
logging.debug(
f"testing regex: {model_regex} against event: {raw_sample}"
)
if re.search(model_regex, raw_sample):
model_match = True
logging.debug(
f'raw_sample_id="{raw_sample_id}", model_name="{model_name}", model_type="{model_type}", model_id="{model_id}", sourcetype_scope="{sourcetype_scope}" has a positive match with the sample event'
)
# add the model_name to current_detected_format, if not already in the list
if model_name not in current_detected_format:
current_detected_format.append(model_name)
# add the model_id to current_detected_format_id, if not already in the list
if model_id not in current_detected_format_id:
current_detected_format_id.append(model_id)
# check if the model is inclusive or exclusive
if model_type == "exclusive":
exclusive_match_anomaly = True
result_sampling = {
"raw_sample_id": raw_sample_id,
"model_match": model_match,
"model_name": model_name,
"model_type": model_type,
"model_id": model_id,
"sourcetype_scope": sourcetype_scope,
"exclusive_match_anomaly": exclusive_match_anomaly,
"message": "positive match found for event",
}
# if mode is test_sampling, add the model_regex to the result_sampling
if self.mode == "test_sampling":
result_sampling["model_regex"] = model_regex
result_sampling_json_list.append(result_sampling)
# if model has a positive match:
# - if not already in the model_split_dict, add it and add the model_count_matched as well as model_name and model_id
# - if already in the model_split_dict, increment the model_count_matched
if model_match:
if model_id not in model_split_dict:
model_split_dict[model_id] = {
"model_count_matched": model_count_matched,
"model_name": model_name,
"model_type": model_type,
}
else:
model_split_dict[model_id][
"model_count_matched"
] += model_count_matched
# no break for exclusive models, we need to check all of them
else:
logging.debug(
f'model_name="{model_name}", model_type="{model_type}", model_id="{model_id}", sourcetype_scope="{sourcetype_scope}" no match found for event'
)
# if not match, generate a negative result
if model_match:
record["result_sampling"] = result_sampling_json_list
else:
result_sampling = {
"raw_sample_id": raw_sample_id,
"model_match": model_match,
"model_name": "N/A",
"model_type": "N/A",
"model_id": "N/A",
"sourcetype_scope": "N/A",
"exclusive_match_anomaly": "N/A",
"message": "no positive match found for event",
}
record["result_sampling"] = [result_sampling]
# add the event to the sample events list
sample_events_list_object = {
"event_id": raw_sample_id,
"model_name": current_detected_format,
"model_id": current_detected_format_id,
"result_sampling": result_sampling,
}
if data_sampling_obfuscation == 0:
# if the event is longer than the limit, add event_is_truncated = True, otherwise event_is_truncated = False
if (
len(raw_sample)
> splk_data_sampling_records_kvrecord_truncate_size
):
sample_events_list_object["event_is_truncated"] = True
else:
sample_events_list_object["event_is_truncated"] = False
sample_events_list_object["event"] = raw_sample[
:splk_data_sampling_records_kvrecord_truncate_size
]
# add event_is_obfuscated = False
sample_events_list_object["event_is_obfuscated"] = False
else:
# add event_is_obfuscated = True
sample_events_list_object["event_is_obfuscated"] = True
sample_events_list.append(json.dumps(sample_events_list_object))
# yield the record
yield_model_match = []
yield_model_name = []
yield_model_type = []
yield_model_id = []
yield_model_regex = []
yield_sourcetype_scope = []
yield_exclusive_match_anomaly = []
yield_message = []
for k, v in record.items():
yield_record[k] = v
# get the content of result_sampling (list)
if k == "result_sampling":
logging.debug(
f'result_sampling="{v}", its type is {type(v)}'
)
for item in v:
yield_model_match.append(item.get("model_match"))
yield_model_name.append(item.get("model_name"))
yield_model_type.append(item.get("model_type"))
yield_model_id.append(item.get("model_id"))
# if mode is test_sampling, add the model_regex to the yield
if self.mode == "test_sampling":
yield_model_regex.append(item.get("model_regex"))
yield_sourcetype_scope.append(
item.get("sourcetype_scope")
)
yield_exclusive_match_anomaly.append(
item.get("exclusive_match_anomaly")
)
yield_message.append(item.get("message"))
# now add our list to yield_record
yield_record["model_match"] = yield_model_match
yield_record["model_name"] = yield_model_name
yield_record["model_type"] = yield_model_type
yield_record["model_id"] = yield_model_id
# if mode is test_sampling, add the model_regex to the yield
if self.mode == "test_sampling":
yield_record["model_regex"] = yield_model_regex
yield_record["sourcetype_scope"] = yield_sourcetype_scope
yield_record["exclusive_match_anomaly"] = (
yield_exclusive_match_anomaly
)
yield_record["message"] = yield_message
# add the _raw
yield_record["_raw"] = json.dumps(record)
# finally yield the record except in run_sampling mode to reduce processing costs
if self.mode != "run_sampling":
yield yield_record
#
# investigate results
#
if len(current_detected_format) > 1:
multiformat_detected = True
# model_split_dict:
# for each model matched in model_split_dict, calculate the percentage of the model match and add to the dict
max_model_pct_match = 0 # Track the highest percentage of matches
major_model_id = None # Track the model ID with the highest matches
major_model_name = None # Track the model name with the highest matches
for model_id, model_dict in model_split_dict.items():
model_count_matched = model_dict.get("model_count_matched")
model_name = model_dict.get("model_name")
model_type = model_dict.get("model_type")
model_pct_match = round(
(model_count_matched / events_count) * 100, 2
)
model_split_dict[model_id]["model_pct_match"] = model_pct_match
# add the total events as model_count_parsed
model_split_dict[model_id]["model_count_parsed"] = events_count
# Determine if this model is the major model
if model_pct_match > max_model_pct_match:
max_model_pct_match = model_pct_match
major_model_id = model_id
major_model_name = model_name
# Now, mark the major model and others
for model_id in model_split_dict:
if model_id == major_model_id:
model_split_dict[model_id]["model_is_major"] = True
else:
model_split_dict[model_id]["model_is_major"] = False
# set the current major detected format
current_detected_major_format = major_model_name
# List of fields to be managed in the sampling record:
# object
# raw_sample
# data_sample_mtime: epochtime
# data_sample_last_entity_epoch_processed: epochtime
# data_sample_feature: string, enabled | disabled | disabled_auto
# data_sample_iteration: integer
# data_sample_anomaly_reason: string
# data_sample_status_colour: string
# data_sample_anomaly_detected: boolean
# data_sample_status_message: dict
# multiformat_detected: boolean
# current_detected_format: list
# current_detected_format_id: list
# current_detected_format_dcount: integer
# current_detected_major_format: string
# previous_detected_format: list
# previous_detected_format_id: list
# previous_detected_format_dcount: integer
# previous_detected_major_format: string
# exclusive_match_anomaly: boolean
# raw_sample: list
# uc: exclusive match anomaly detected:
# - if in the list of matched models in model_split_dict, an exclusive model is detected and its percentage is higher than the max allowed, set True
#
# exclusive match anomaly detected:
#
exclusive_match_anomaly = False
for model_id, model_dict in model_split_dict.items():
model_pct_match = model_dict.get("model_pct_match")
model_type = model_dict.get("model_type")
if (
model_type == "exclusive"
and model_pct_match > pct_max_exclusive_model_match
):
exclusive_match_anomaly = True
#
# inclusive match anomaly detected:
#
inclusive_match_anomaly = False
for model_id, model_dict in model_split_dict.items():
model_pct_match = model_dict.get("model_pct_match")
model_type = model_dict.get("model_type")
model_is_major = model_dict.get("model_is_major")
if (
model_type == "inclusive"
and model_is_major
and model_pct_match < pct_min_major_inclusive_model_match
):
inclusive_match_anomaly = True
# create a model_summary_list based on the model_split_dict:
# for each model in model_split_dict, add a record with:
# model_name | pct_match: percentage of match | model_type: inclusive or exclusive | model_is_major: boolean
model_summary_list = []
for model_id, model_dict in model_split_dict.items():
model_summary_record = f'{model_dict.get("model_name")} | pct_match: {model_dict.get("model_pct_match")} | type: {model_dict.get("model_type")}'
model_summary_list.append(model_summary_record)
#
# define the status of the feature
#
if exclusive_match_anomaly:
data_sample_epoch = time.time()
data_sample_model_matched_summary = model_split_dict
data_sample_anomaly_reason = "exclusive_rule_match"
data_sample_feature = "enabled"
data_sample_anomaly_detected = 1
data_sample_status_colour = "red"
data_sample_status_message = {
"state": "red",
"desc": "Anomalies detected, one or more exclusive rules have been matched.",
"remediation": "Exclusive matches mean that regular expressions have matched forbidden content in one or more events, review the latest sample events to identify the root cause. Once the issue is fixed, click on clear state & run sampling.",
"last_run": f"{convert_epoch_to_datetime(data_sample_epoch)}",
"anomaly_reason": data_sample_anomaly_reason,
"multiformat": multiformat_detected,
"events_count": events_count,
"min_time_btw_iterations_seconds": min_time_btw_iterations_seconds,
"pct_min_major_inclusive_model_match": pct_min_major_inclusive_model_match,
"pct_max_exclusive_model_match": pct_max_exclusive_model_match,
"max_events_per_sampling_iteration": max_events_per_sampling_iteration,
"relative_time_window_seconds": relative_time_window_seconds,
"current_detected_major_format": current_detected_major_format,
"models_summary": model_summary_list,
}
# inclusive match anomaly detected at the time of the discovery with multiple formats detected:
# - Disable the feature to avoid generating false positive, in the sense that most likely this feed
# is not a good candidate for data sampling
# - However, we still want to attempt processing this feed in the case of a change in conditions, but keep disabled_auto
# so we do not influence the entity status
elif (
inclusive_match_anomaly
and data_sample_iteration == 1
and multiformat_detected
):
data_sample_epoch = time.time()
data_sample_model_matched_summary = model_split_dict
data_sample_anomaly_reason = "anomalies_at_discovery"
data_sample_feature = "disabled_auto"
data_sample_anomaly_detected = 2
data_sample_status_colour = "orange"
data_sample_status_message = {
"state": "orange",
"desc": "Anomalies were detected since the entity discovery, multiple formats were detected and the major model is under the acceptable threshold of percentage of events matched by the major model. The data sampling feature was automatically disabled (disabled_auto) to avoid generating false positive for this entity (the feature will not be allowed to influence the entity status), however TrackMe will continue attempting to process in case conditions for this feed change.",
"last_run": f"{convert_epoch_to_datetime(data_sample_epoch)}",
"anomaly_reason": data_sample_anomaly_reason,
"multiformat": multiformat_detected,
"events_count": events_count,
"min_time_btw_iterations_seconds": min_time_btw_iterations_seconds,
"pct_min_major_inclusive_model_match": pct_min_major_inclusive_model_match,
"pct_max_exclusive_model_match": pct_max_exclusive_model_match,
"max_events_per_sampling_iteration": max_events_per_sampling_iteration,
"relative_time_window_seconds": relative_time_window_seconds,
"current_detected_major_format": current_detected_major_format,
"models_summary": model_summary_list,
}
# inclusive match anomaly detected at the time of the discovery and next iterations
# - Disable the feature to avoid generating false positive, in the sense that most likely this feed
# is not a good candidate for data sampling
# - However, we still want to attempt processing this feed in the case of a change in conditions, but keep disabled_auto
# so we do not influence the entity status
elif (
inclusive_match_anomaly
and data_sample_iteration > 1
and multiformat_detected
and data_sample_feature == "disabled_auto"
):
data_sample_epoch = time.time()
data_sample_model_matched_summary = model_split_dict
data_sample_anomaly_reason = "anomalies_since_discovery"
data_sample_feature = "disabled_auto"
data_sample_anomaly_detected = 2
data_sample_status_colour = "orange"
data_sample_status_message = {
"state": "orange",
"desc": "Anomalies were detected since the entity discovery, multiple formats were detected and the major model is under the acceptable threshold of percentage of events matched by the major model. The data sampling feature was automatically disabled (disabled_auto) to avoid generating false positive for this entity (the feature will not be allowed to influence the entity status), however TrackMe will continue attempting to process in case conditions for this feed change.",
"remediation": "Review events generated for this feed, when TrackMe first discover the entity and finds multiple format that would generate an inclusive anomaly (the percentage of events for the major format goes bellow the minimal acceptable percentage of events), the feature is automatically disabled. The issue can be addressed by the creation of a custom model that is more adapted to the feed context, or may need to remaind disable if the feed is not a right candidate for the sampling feature, such as a sourcetype with poor event quality, or a sourcetype where many various events formats are expected and accepted.",
"last_run": f"{convert_epoch_to_datetime(data_sample_epoch)}",
"anomaly_reason": data_sample_anomaly_reason,
"multiformat": multiformat_detected,
"events_count": events_count,
"min_time_btw_iterations_seconds": min_time_btw_iterations_seconds,
"pct_min_major_inclusive_model_match": pct_min_major_inclusive_model_match,
"pct_max_exclusive_model_match": pct_max_exclusive_model_match,
"max_events_per_sampling_iteration": max_events_per_sampling_iteration,
"relative_time_window_seconds": relative_time_window_seconds,
"current_detected_major_format": current_detected_major_format,
"models_summary": model_summary_list,
}
# inclusive match anomaly after discovery and enablement
elif inclusive_match_anomaly:
data_sample_epoch = time.time()
data_sample_model_matched_summary = model_split_dict
data_sample_anomaly_reason = "inclusive_rule_match"
data_sample_feature = "enabled"
data_sample_anomaly_detected = 1
data_sample_status_colour = "red"
data_sample_status_message = {
"state": "red",
"desc": "Anomalies detected, quality issues were detected, the min percentage of the major model matched does not meet requirements which indicates that a too large number of events do not share the same format that than the majority of events.",
"remediation": "Inclusive matches mean that regular expressions have not matched the expected content in one or more events, review the latest sample events to identify the root cause. Once the issue is fixed, click on clear state & run sampling.",
"last_run": f"{convert_epoch_to_datetime(data_sample_epoch)}",
"anomaly_reason": data_sample_anomaly_reason,
"multiformat": multiformat_detected,
"events_count": events_count,
"min_time_btw_iterations_seconds": min_time_btw_iterations_seconds,
"pct_min_major_inclusive_model_match": pct_min_major_inclusive_model_match,
"pct_max_exclusive_model_match": pct_max_exclusive_model_match,
"max_events_per_sampling_iteration": max_events_per_sampling_iteration,
"relative_time_window_seconds": relative_time_window_seconds,
"current_detected_major_format": current_detected_major_format,
"models_summary": model_summary_list,
}
# uc: major format has changed
elif (
data_sample_iteration > 1
and current_detected_major_format
and previous_detected_major_format
and current_detected_major_format != previous_detected_major_format
and previous_detected_major_format != "raw_not_identified"
):
data_sample_epoch = time.time()
data_sample_model_matched_summary = model_split_dict
data_sample_anomaly_reason = "format_change"
data_sample_feature = "enabled"
data_sample_anomaly_detected = 1
data_sample_status_colour = "red"
data_sample_status_message = {
"state": "red",
"desc": f"The major event format (the format previously detected for the majority of events) has changed from {previous_detected_major_format} to {current_detected_major_format}, this might indicate a non expected quality issue or condition change in the ingest of this feed in Splunk.",
"remediation": "Review the latest sample events to identify the root cause. Once the issue is fixed, click on clear state & run sampling.",
"last_run": f"{convert_epoch_to_datetime(data_sample_epoch)}",
"anomaly_reason": data_sample_anomaly_reason,
"multiformat": multiformat_detected,
"events_count": events_count,
"min_time_btw_iterations_seconds": min_time_btw_iterations_seconds,
"pct_min_major_inclusive_model_match": pct_min_major_inclusive_model_match,
"pct_max_exclusive_model_match": pct_max_exclusive_model_match,
"max_events_per_sampling_iteration": max_events_per_sampling_iteration,
"relative_time_window_seconds": relative_time_window_seconds,
"current_detected_major_format": current_detected_major_format,
"models_summary": model_summary_list,
}
# no format detected, do not raise an alert
elif current_detected_major_format == "raw_not_identified":
data_sample_epoch = time.time()
data_sample_model_matched_summary = model_split_dict
data_sample_anomaly_reason = "no_anomalies_detected"
data_sample_feature = "enabled"
data_sample_anomaly_detected = 2
data_sample_status_colour = "orange"
data_sample_status_message = {
"state": "orange",
"desc": "No events format were detected for this entity. (raw_not_identified)",
"remediation": "Review events in this feed, you can address this condition by creating a custom model for these events, you can set the sourcetype scope to be matching especially this entity sourcetype or set the sourcetype scope to be eligible for other feeds too.",
"last_run": f"{convert_epoch_to_datetime(data_sample_epoch)}",
"anomaly_reason": data_sample_anomaly_reason,
"multiformat": multiformat_detected,
"events_count": events_count,
"min_time_btw_iterations_seconds": min_time_btw_iterations_seconds,
"pct_min_major_inclusive_model_match": pct_min_major_inclusive_model_match,
"pct_max_exclusive_model_match": pct_max_exclusive_model_match,
"max_events_per_sampling_iteration": max_events_per_sampling_iteration,
"relative_time_window_seconds": relative_time_window_seconds,
"current_detected_major_format": current_detected_major_format,
"models_summary": model_summary_list,
}
# else, we have no anomalies detected
else:
data_sample_epoch = time.time()
data_sample_model_matched_summary = model_split_dict
data_sample_anomaly_reason = "no_anomalies_detected"
# Healthy outcome for this entity: no anomaly was found in this sampling
# iteration, so mark the state green and build the structured status
# message that is stored in the KVstore record and logged below.
data_sample_feature = "enabled"
data_sample_anomaly_detected = 0
data_sample_status_colour = "green"
data_sample_status_message = {
    "state": "green",
    "desc": "No anomalies were detected during the last data sampling iteration.",
    "remediation": "N/A.",
    "last_run": f"{convert_epoch_to_datetime(data_sample_epoch)}",
    "anomaly_reason": data_sample_anomaly_reason,
    "multiformat": multiformat_detected,
    "events_count": events_count,
    "min_time_btw_iterations_seconds": min_time_btw_iterations_seconds,
    "pct_min_major_inclusive_model_match": pct_min_major_inclusive_model_match,
    "pct_max_exclusive_model_match": pct_max_exclusive_model_match,
    "max_events_per_sampling_iteration": max_events_per_sampling_iteration,
    "relative_time_window_seconds": relative_time_window_seconds,
    "current_detected_major_format": current_detected_major_format,
    "models_summary": model_summary_list,
}
# log results
# NOTE(review): current_detected_format is interpolated twice in this
# message — one occurrence looks redundant; confirm before removing.
logging.info(
    f'tenant_id={self.tenant_id}, Data sampling terminated, object="{object_value}", key="{object_key}", events_count="{events_count}", current_detected_format="{current_detected_format}", data_sample_epoch="{data_sample_epoch}", data_sample_model_matched_summary="{json.dumps(model_split_dict, indent=2)}", data_sample_feature="{data_sample_feature}", data_sample_iteration="{data_sample_iteration}", data_sample_anomaly_reason="{data_sample_anomaly_reason}", data_sample_status_colour="{data_sample_status_colour}", data_sample_anomaly_detected="{data_sample_anomaly_detected}", data_sample_status_message="{json.dumps(data_sample_status_message, indent=2)}", multiformat_detected="{multiformat_detected}", current_detected_format="{current_detected_format}", current_detected_format_id="{current_detected_format_id}", current_detected_format_dcount="{len(current_detected_format)}", previous_detected_format="{previous_detected_format}", previous_detected_format_id="{previous_detected_format_id}", previous_detected_format_dcount="{previous_detected_format_dcount}", exclusive_match_anomaly="{exclusive_match_anomaly}"'
)
# insert or update the KVstore record (list of fields in List of fields to be managed in the sampling record)
# Persistence only happens in effective run mode.
if self.mode == "run_sampling":
    #
    # restrict samples stored in the KVstore to x events per model match according to system wide configuration
    #
    # Group sample events by model match
    # NOTE(review): plain dict grouping; collections.defaultdict(list)
    # would express the same intent more directly.
    events_by_model = {}
    for event in sample_events_list:
        event_data = json.loads(event)
        # NOTE(review): tuple(None) raises TypeError if a sample event
        # lacks the "model_id" key — presumably every event carries one;
        # confirm against the code that builds sample_events_list.
        model_id = tuple(
            event_data.get("model_id")
        )  # Convert model_id list to a tuple (lists are not hashable)
        # Initialize the list for this model_id if it doesn't exist
        if model_id not in events_by_model:
            events_by_model[model_id] = []
        # Append the event to the corresponding model_id list
        events_by_model[model_id].append(event_data)
    # Limit to x events per model match
    limited_sample_events_list = []
    for model_id, events in events_by_model.items():
        # Take events for each model match according to system wide configuration
        limited_events = events[
            :splk_data_sampling_no_records_saved_kvrecord
        ]
        limited_sample_events_list.extend(limited_events)
    # Serialize the limited sample events list, considering obfuscation
    serialized_sample_events_list = []
    for event in limited_sample_events_list:
        if data_sampling_obfuscation == 0:
            # Include raw event data if obfuscation is not enabled
            serialized_sample_events_list.append(json.dumps(event))
        else:
            # Exclude the raw event data if obfuscation is enabled
            event.pop("event", None)  # Remove the raw event data
            serialized_sample_events_list.append(json.dumps(event))
    #
    # KVstore record update/insert
    #
    # kvrecord_updated is surfaced in the yielded summary record so the
    # operator can see whether persistence succeeded for this entity.
    kvrecord_updated = False
    try:
        if not key:
            # insert: no pre-existing KVstore record for this entity
            collection.data.insert(
                json.dumps(
                    {
                        "_key": object_key,
                        "object": object_value,
                        "min_time_btw_iterations_seconds": min_time_btw_iterations_seconds,
                        "pct_min_major_inclusive_model_match": pct_min_major_inclusive_model_match,
                        "pct_max_exclusive_model_match": pct_max_exclusive_model_match,
                        "max_events_per_sampling_iteration": max_events_per_sampling_iteration,
                        "relative_time_window_seconds": relative_time_window_seconds,
                        "events_count": events_count,
                        "data_sample_mtime": data_sample_epoch,
                        "data_sample_last_entity_epoch_processed": data_last_time_seen,
                        "data_sample_model_matched_summary": model_split_dict,
                        "data_sample_feature": data_sample_feature,
                        "data_sample_iteration": data_sample_iteration,
                        "data_sample_anomaly_reason": data_sample_anomaly_reason,
                        "data_sample_status_colour": data_sample_status_colour,
                        "data_sample_anomaly_detected": data_sample_anomaly_detected,
                        "data_sample_status_message": data_sample_status_message,
                        "multiformat_detected": multiformat_detected,
                        "current_detected_format": current_detected_format,
                        "current_detected_format_id": current_detected_format_id,
                        "current_detected_format_dcount": len(
                            current_detected_format
                        ),
                        "current_detected_major_format": current_detected_major_format,
                        "previous_detected_format": previous_detected_format,
                        "previous_detected_format_id": previous_detected_format_id,
                        "previous_detected_format_dcount": previous_detected_format_dcount,
                        "previous_detected_major_format": previous_detected_major_format,
                        "exclusive_match_anomaly": exclusive_match_anomaly,
                        "raw_sample": serialized_sample_events_list,
                    }
                )
            )
            kvrecord_updated = True
        else:  # update
            # NOTE(review): this payload duplicates the insert payload
            # except for "_key"; building the dict once and overriding
            # "_key" would keep the two branches in sync.
            collection.data.update(
                key,
                json.dumps(
                    {
                        "_key": key,
                        "object": object_value,
                        "min_time_btw_iterations_seconds": min_time_btw_iterations_seconds,
                        "pct_min_major_inclusive_model_match": pct_min_major_inclusive_model_match,
                        "pct_max_exclusive_model_match": pct_max_exclusive_model_match,
                        "max_events_per_sampling_iteration": max_events_per_sampling_iteration,
                        "relative_time_window_seconds": relative_time_window_seconds,
                        "events_count": events_count,
                        "data_sample_mtime": data_sample_epoch,
                        "data_sample_last_entity_epoch_processed": data_last_time_seen,
                        "data_sample_model_matched_summary": model_split_dict,
                        "data_sample_feature": data_sample_feature,
                        "data_sample_iteration": data_sample_iteration,
                        "data_sample_anomaly_reason": data_sample_anomaly_reason,
                        "data_sample_status_colour": data_sample_status_colour,
                        "data_sample_anomaly_detected": data_sample_anomaly_detected,
                        "data_sample_status_message": data_sample_status_message,
                        "multiformat_detected": multiformat_detected,
                        "current_detected_format": current_detected_format,
                        "current_detected_format_id": current_detected_format_id,
                        "current_detected_format_dcount": len(
                            current_detected_format
                        ),
                        "current_detected_major_format": current_detected_major_format,
                        "previous_detected_format": previous_detected_format,
                        "previous_detected_format_id": previous_detected_format_id,
                        "previous_detected_format_dcount": previous_detected_format_dcount,
                        "previous_detected_major_format": previous_detected_major_format,
                        "exclusive_match_anomaly": exclusive_match_anomaly,
                        "raw_sample": serialized_sample_events_list,
                    }
                ),
            )
            kvrecord_updated = True
            logging.info(
                f'tenant_id="{self.tenant_id}", component="splk-dsm", successfully updated the KVstore record'
            )
    except Exception as e:
        # Best effort: a persistence failure is logged but does not stop
        # the sampling loop; kvrecord_updated stays False.
        logging.error(
            f'tenant_id="{self.tenant_id}", component="splk-dsm", failed to insert or update the KVstore record, exception="{str(e)}"'
        )
    # yield a simple summary record
    yield_record = {
        "_time": time.time(),
        "object": object_value,
        "events_count": events_count,
        "data_sample_status_message": data_sample_status_message,
        "data_sample_model_matched_summary": model_split_dict,
        "kvrecord_updated": kvrecord_updated,
        "run_time": round(time.time() - iteration_start_time, 2),
    }
    yield_record["_raw"] = json.dumps(yield_record)
    yield yield_record
# gen models metrics
# Stream the per-model match summary to the metric index; metric
# generation is best effort and never interrupts the sampling loop.
try:
    trackme_splk_dsm_data_sampling_gen_metrics(
        self.tenant_id,
        metric_index,
        object_value,
        object_key,
        model_split_dict,
    )
except Exception as e:
    error_msg = f'tenant_id="{self.tenant_id}", object="{object_value}", object_id="{key}", failed to stream events to metrics with exception="{str(e)}"'
    logging.error(error_msg)
# gen metrics
# Total wall-clock time spent on this entity, streamed as a metric too.
entity_total_elapsed_time = time.time() - entity_search_start
try:
    trackme_splk_dsm_data_sampling_total_run_time_gen_metrics(
        self.tenant_id,
        metric_index,
        object_value,
        object_key,
        entity_total_elapsed_time,
        events_count,
    )
except Exception as e:
    error_msg = f'tenant_id="{self.tenant_id}", object="{object_value}", object_id="{key}", failed to stream events to metrics with exception="{str(e)}"'
    logging.error(error_msg)
# notification event
# Emit a TrackMe handler notification recording that sampling ran for this
# entity; best effort — failures are logged but never interrupt the loop.
try:
    trackme_handler_events(
        session_key=self._metadata.searchinfo.session_key,
        splunkd_uri=self._metadata.searchinfo.splunkd_uri,
        tenant_id=self.tenant_id,
        sourcetype="trackme:handler",
        source=f"trackme:handler:{self.tenant_id}",
        handler_events=[
            {
                "object": object_value,
                "object_id": object_key,
                "object_category": "splk-dsm",
                "handler": f"trackme_dsm_data_sampling_tracker_tenant_{self.tenant_id}",
                "handler_message": "Data sampling was performed for this entity.",
                # fixed: a space was missing between the sourcetype value and
                # the tenant_id filter, which made this troubleshoot search
                # match nothing (the sourcetype term swallowed "tenant_id=").
                "handler_troubleshoot_search": f'index=_internal sourcetype=trackme:custom_commands:trackmesamplingexecutor tenant_id={self.tenant_id} object="{object_value}"',
                "handler_time": time.time(),
            }
        ],
    )
except Exception as e:
    logging.error(
        # fixed: stray f-prefix was rendered literally in the log message
        # (it appeared as component=f"splk-dsm" in the emitted text).
        f'tenant_id="{self.tenant_id}", component="splk-dsm", could not send notification event, exception="{e}"'
    )
# Calculate the execution time for this iteration
iteration_end_time = time.time()
execution_time = iteration_end_time - iteration_start_time
# Update total execution time and iteration count
total_execution_time += execution_time
iteration_count += 1
# Calculate average execution time
# NOTE(review): iteration_count was just incremented, so the else branch
# is purely defensive — it can never be taken here.
if iteration_count > 0:
    average_execution_time = total_execution_time / iteration_count
else:
    average_execution_time = 0
# Check if there is enough time left to continue
# Stop early if the next (average-sized) iteration plus a 120 second
# safety margin would overrun max_runtime.
elapsed_time = time.time() - start
if elapsed_time + average_execution_time + 120 >= max_runtime:
    logging.info(
        f'tenant_id="{self.tenant_id}", component="splk-dsm", max_runtime="{max_runtime}" is about to be reached, current_runtime="{elapsed_time}", job will be terminated now'
    )
    break
# end of the main loop
# If the upstream search returned no entities, still yield one record so
# the job produces visible output.
if entities_count == 0:
    # yield a simple summary record
    yield_record = {
        "_time": time.time(),
        "result": "There are no entities to process at this time.",
        "search": upstream_search_string,
    }
    yield_record["_raw"] = json.dumps(yield_record)
    yield yield_record
# end
# NOTE(review): this final log reports entities_count from the variable
# "count" while the emptiness check above uses "entities_count" — confirm
# upstream that both track the same value.
logging.info(
    f'tenant_id="{self.tenant_id}" data sampling job successfully executed, status="success", run_time="{round(time.time() - start, 3)}", report_name="{str(report_name)}", entities_count="{str(count)}"'
)
# Call the component register
# Record a success summary for this tracker in the tenant object registry;
# only done when the command runs in effective "run_sampling" mode.
if self.mode == "run_sampling":
    trackme_register_tenant_object_summary(
        session_key,
        self._metadata.searchinfo.splunkd_uri,
        self.tenant_id,
        "splk-dsm",
        f"trackme_dsm_data_sampling_tracker_tenant_{self.tenant_id}",
        "success",
        time.time(),
        str(time.time() - start),
        "The report was executed successfully",
        "-24h",
        "now",
    )
# Module entry point: hand the command class to splunklib's dispatcher,
# which reads the search-protocol input from stdin and writes results to
# stdout on behalf of DataSamplingExecutor.
dispatch(DataSamplingExecutor, sys.argv, sys.stdin, sys.stdout, __name__)