#!/usr/bin/env python
# coding=utf-8
__author__ = "TrackMe Limited"
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
__credits__ = ["Guilhem Marchand"]
__license__ = "TrackMe Limited, all rights reserved"
__version__ = "0.1.0"
__maintainer__ = "TrackMe Limited, U.K."
__email__ = "support@trackme-solutions.com"
__status__ = "PRODUCTION"
# Standard library
import os
import sys
import time
import json
import random
import difflib
import hashlib
import fnmatch
from datetime import datetime
# External libraries
import requests
from requests.structures import CaseInsensitiveDict
import urllib3
import urllib.parse
# Disable urllib3 warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Configure logging
import logging
from logging.handlers import RotatingFileHandler
# set splunkhome
splunkhome = os.environ["SPLUNK_HOME"]
# set logging
filehandler = RotatingFileHandler(
"%s/var/log/splunk/trackme_splkwlk_getreportsdef_stream.log" % splunkhome,
mode="a",
maxBytes=10000000,
backupCount=1,
)
formatter = logging.Formatter(
"%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s"
)
logging.Formatter.converter = time.gmtime
filehandler.setFormatter(formatter)
log = logging.getLogger() # root logger - Good to get it only once.
for hdlr in log.handlers[:]: # remove the existing file handlers
if isinstance(hdlr, logging.FileHandler):
log.removeHandler(hdlr)
log.addHandler(filehandler) # set the new handler
# set the log level to INFO (the root default is ERROR); the effective level is set later from the TrackMe configuration
log.setLevel(logging.INFO)
# append current directory
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# import libs
import import_declare_test
# import Splunk libs (after lib appended)
from splunklib.searchcommands import (
dispatch,
StreamingCommand,
Configuration,
Option,
validators,
)
import splunklib.client as client
# import trackme libs (after lib appended)
from trackme_libs import (
trackme_reqinfo,
trackme_register_tenant_object_summary,
run_splunk_search,
trackme_handler_events,
)
from trackme_libs_splk_wlk import trackme_ingest_version
from trackme_libs_utils import decode_unicode, remove_leading_spaces
# import trackme libs croniter
from trackme_libs_croniter import cron_to_seconds
@Configuration(distributed=False)
class SplkWlkGetReportsDef(StreamingCommand):
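"""
Streaming custom command for the TrackMe splk-wlk (workload) component: for each
upstream record describing a scheduled savedsearch, retrieve its definition
(locally or through a remote account), maintain the versioning and orphan
KVstore collections, and yield an enriched record including the search,
schedule, metadata and version_id.
"""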
tenant_id = Option(
doc="""
**Syntax:** **tenant_id=****
**Description:** The tenant identifier.""",
require=True,
default=None,
)
context = Option(
doc="""
**Syntax:** **context=****
**Description:** The context is used for simulation purposes, defaults to live.""",
require=False,
default="live",
validate=validators.Match("context", r"^(live|simulation)$"),
)
check_orphan = Option(
doc="""
**Syntax:** **check_orphan=****
**Description:** If enabled, check for orphan status.""",
require=False,
default=False,
validate=validators.Boolean(),
)
register_component = Option(
doc="""
**Syntax:** **register_component=****
**Description:** If the search is invoked by a tracker, register_component can be set to capture and register any execution exception.""",
require=False,
default=False,
)
report = Option(
doc="""
**Syntax:** **report=****
**Description:** If register_component is set, a value for report is required.""",
require=False,
default=None,
validate=validators.Match("report", r"^.*$"),
)
exclude_apps = Option(
doc="""
**Syntax:** **exclude_apps=****
**Description:** A comma separated list of apps that will never be considered.""",
require=False,
default="skynet-rest,cloud-monitoring-console-summarizer",
validate=validators.Match("exclude_apps", r"^.*$"),
)
max_runtime_sec = Option(
doc="""
**Syntax:** **max_runtime_sec=****
**Description:** The max runtime for the job in seconds, defaults to 900 (15 minutes); a 120 seconds safety margin is applied when checking the remaining time.""",
require=False,
default="900",
validate=validators.Match("max_runtime_sec", r"^\d*$"),
)
filters_get_last_updates = Option(
doc="""
**Syntax:** **filters_get_last_updates=****
**Description:** An optional search string to restrict the Search Head tiers when looking at the last updates of savedsearches (to identify who modified a search and when), defaults to host=*.""",
require=False,
default="host=*",
validate=validators.Match("filters_get_last_updates", r"^.*$"),
)
def generate_diff_string(self, a, b):
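"""Return a unified diff (difflib) between two values, labelled last_known vs current; None values are treated as empty strings."""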
# Handle None values gracefully
if a is None:
a = ""
if b is None:
b = ""
# Convert to strings if they aren't already
a_str = str(a) if a is not None else ""
b_str = str(b) if b is not None else ""
a_lines = a_str.splitlines(keepends=True)
b_lines = b_str.splitlines(keepends=True)
diff = difflib.unified_diff(
a_lines, b_lines, fromfile="last_known", tofile="current", lineterm=""
)
return "".join(diff)
def is_reachable(self, session, url, timeout):
try:
session.get(url, timeout=timeout, verify=False)
return True, None
except Exception as e:
return False, str(e)
def select_url(self, session, splunk_url):
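"""Split a comma separated list of Splunk API URLs, probe each one, and return a randomly selected reachable URL (or False if none), along with the list of (url, error) tuples for unreachable endpoints."""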
splunk_urls = splunk_url.split(",")
unreachable_errors = []
reachable_urls = []
for url in splunk_urls:
reachable, error = self.is_reachable(session, url, 10)
if reachable:
reachable_urls.append(url)
else:
unreachable_errors.append((url, error))
selected_url = random.choice(reachable_urls) if reachable_urls else False
return selected_url, unreachable_errors
def log_and_register_failure(self, error_msg, session_key, start, earliest, latest):
logging.error(error_msg)
if self.register_component and self.tenant_id and self.report:
try:
trackme_register_tenant_object_summary(
session_key,
self._metadata.searchinfo.splunkd_uri,
self.tenant_id,
"splk-wlk",
self.report,
"failure",
time.time(),
round(time.time() - start, 3),
error_msg,
earliest,
latest,
)
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", component="splk-wlk", Failed to call trackme_register_tenant_object_summary with exception="{str(e)}"'
)
elif self.register_component:
logging.error(
"If register_component is set, then tenant_id, report, and component must be set too."
)
raise Exception(error_msg)
# get account creds with least privilege approach
def get_account(self, session_key, splunkd_uri, account):
"""
Retrieve account creds.
"""
# Ensure splunkd_uri starts with "https://"
if not splunkd_uri.startswith("https://"):
splunkd_uri = f"https://{splunkd_uri}"
# Build header and target URL
headers = CaseInsensitiveDict()
headers["Authorization"] = f"Splunk {session_key}"
headers["Content-Type"] = "application/json"
target_url = (
f"{splunkd_uri}/services/trackme/v2/configuration/get_remote_account"
)
# Create a requests session for better performance
session = requests.Session()
session.headers.update(headers)
try:
# Use a context manager to handle the request
with session.post(
target_url, data=json.dumps({"account": account}), verify=False
) as response:
if response.ok:
response_json = response.json()
return response_json
else:
error_message = f'Failed to retrieve account, status_code={response.status_code}, response_text="{response.text}"'
logging.error(error_message)
raise Exception(error_message)
except Exception as e:
error_message = f'Failed to retrieve account, exception="{str(e)}"'
logging.error(error_message)
raise Exception(error_message)
# get the list of all accounts with least privileges approach
def list_accounts(self, session_key, splunkd_uri):
"""
List all accounts.
"""
# Ensure splunkd_uri starts with "https://"
if not splunkd_uri.startswith("https://"):
splunkd_uri = f"https://{splunkd_uri}"
# Build header and target URL
headers = CaseInsensitiveDict()
headers["Authorization"] = f"Splunk {session_key}"
headers["Content-Type"] = "application/json"
target_url = f"{splunkd_uri}/services/trackme/v2/configuration/list_accounts"
# Create a requests session for better performance
session = requests.Session()
session.headers.update(headers)
try:
# Use a context manager to handle the request
with session.get(target_url, verify=False) as response:
if response.ok:
logging.debug(
f'Success retrieving list of accounts, data="{response.json()}", response_text="{response.text}"'
)
response_json = response.json()
return response_json
else:
error_message = f'Failed to retrieve accounts, status_code={response.status_code}, response_text="{response.text}"'
logging.error(error_message)
raise Exception(error_message)
except Exception as e:
error_message = f'Failed to retrieve accounts, exception="{str(e)}"'
logging.error(error_message)
raise Exception(error_message)
# get a targeted KVrecord
def get_kv_record(self, versioning_collection, record_object_id):
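"""Query the versioning KVstore collection for the given object id; return (kvrecord, kvrecordkey, kvrecorddict), or (None, None, None) if the record cannot be retrieved."""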
try:
query_string = {
"_key": record_object_id,
}
kvrecord = versioning_collection.data.query(query=json.dumps(query_string))[
0
]
kvrecordkey = kvrecord.get("_key")
kvrecorddict = json.loads(kvrecord.get("version_dict"))
except Exception as e:
kvrecordkey = None
kvrecord = None
kvrecorddict = None
return kvrecord, kvrecordkey, kvrecorddict
# sort the JSON dict by the most recent epoch
def sort_json_by_epoch(self, json_dict: dict) -> dict:
# Sort the dictionary by the "time_inspected_epoch" value in descending order
sorted_json_dict = {
k: v
for k, v in sorted(
json_dict.items(),
key=lambda item: item[1]["time_inspected_epoch"],
reverse=True,
)
}
# Return the sorted dictionary
return sorted_json_dict
# establish remote connectivity
def establish_remote_service(
self, splunk_url, bearer_token, connect_user, record_app, account
):
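"""Connect to the remote Splunk API using the bearer token and verify connectivity by listing the remote apps; return (remote_service_established, service, header)."""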
# use urlparse to extract relevant info from target
parsed_url = urllib.parse.urlparse(splunk_url)
# Establish the remote service
logging.debug(
f'Establishing connection to host="{parsed_url.hostname}" on port="{parsed_url.port}"'
)
# boolean for service connection check
remote_service_established = False
service = None
header = None
try:
service = client.connect(
host=parsed_url.hostname,
splunkToken=str(bearer_token),
owner=connect_user,
app=record_app,
port=parsed_url.port,
autologin=True,
timeout=600,
)
# get the list of remote apps to test the connectivity effectively
remote_apps = [app.label for app in service.apps]
if remote_apps:
logging.debug(
f'remote search connectivity check to host="{parsed_url.hostname}" on port="{parsed_url.port}" was successful'
)
remote_service_established = True
# set header
header = {
"Authorization": "Bearer %s" % bearer_token,
"Content-Type": "application/json",
}
else:
remote_service_established = False
service = False
error_msg = f'remote search for account="{account}" has failed at connectivity check, in some use cases this may be expected, host="{parsed_url.hostname}" on port="{parsed_url.port}", connect_user="{connect_user}", connect_app="{record_app}", no remote apps found'
logging.error(error_msg)
except Exception as e:
remote_service_established = False
service = False
error_msg = f'remote search for account="{account}" has failed at connectivity check, in some use cases this may be expected, host="{parsed_url.hostname}" on port="{parsed_url.port}", connect_user="{connect_user}", connect_app="{record_app}", exception="{str(e)}"'
logging.warning(error_msg)
return remote_service_established, service, header
# establish local connectivity
def establish_local_service(self, session_key, connect_user, record_app):
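"""Connect to the local splunkd API using the session key; return (selected_url, service, header), where service is False if the connection or the apps listing failed."""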
# set target
selected_url = self._metadata.searchinfo.splunkd_uri
parsed_url = urllib.parse.urlparse(selected_url)
try:
# explicit service
service = client.connect(
token=str(session_key),
owner=connect_user,
app=record_app,
host=parsed_url.hostname,
port=parsed_url.port,
timeout=600,
)
remote_apps = [app.label for app in service.apps]
if not remote_apps:
service = False
except Exception as e:
service = False
# set header
header = {
"Authorization": "Splunk %s" % session_key,
"Content-Type": "application/json",
}
return selected_url, service, header
# default record
def yield_default_record(
self,
tenant_id,
record_object,
record_object_id,
account,
record_app,
record_user,
record_savedsearch_name,
message,
):
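"""Build a placeholder result record with all savedsearch fields set to "None" and the provided message; used when the savedsearch definition cannot be retrieved."""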
record = {
"_time": time.time(),
"tenant_id": tenant_id,
"object": record_object,
"object_id": record_object_id,
"account": account,
"app": record_app,
"user": record_user,
"savedsearch_name": record_savedsearch_name,
"search": "None",
"earliest_time": "None",
"latest_time": "None",
"cron_schedule": "None",
"cron_exec_sequence_sec": "None",
"description": "None",
"disabled": "None",
"is_scheduled": "None",
"schedule_window": "None",
"workload_pool": "None",
"owner": "None",
"sharing": "None",
"metrics": "None",
"json_data": "None",
"version_id": "None",
"message": message,
}
return record
# ingest version
def ingest_version(
self, object_value, splunk_index, splunk_sourcetype, splunk_source, json_data
):
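"""Index a version event in the TrackMe summary index via trackme_ingest_version, enriching the provided json_data with the tenant, object and a sha256 based event_id."""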
# add for the indexing purposes
new_event_json = {}
new_event_json["tenant_id"] = self.tenant_id
new_event_json["object"] = object_value
new_event_json["object_category"] = "splk-wlk"
for key, value in json_data.items():
new_event_json[key] = value
# add the event_id
new_event_json["event_id"] = hashlib.sha256(
json.dumps(json_data).encode()
).hexdigest()
# Index the version
try:
trackme_ingest_version(
index=splunk_index,
sourcetype=splunk_sourcetype,
source=splunk_source,
event=json.dumps(new_event_json),
)
logging.debug(
f'TrackMe version event created successfully, record="{json.dumps(json_data, indent=1)}"'
)
except Exception as e:
logging.error(
f'TrackMe version event creation failure, record="{json.dumps(json_data, indent=1)}", exception="{str(e)}"'
)
# get last updates table
def get_last_updates(
self,
session_key,
server_rest_uri,
account,
):
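"""Search _internal (sourcetype=splunkd_ui_access) on the target account for recent POST requests to the saved/searches endpoints and return a dict keyed by savedsearch_name with the last update time and user."""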
if account != "local":
# get account
account_dict = self.get_account(session_key, server_rest_uri, account)
splunk_url = account_dict["splunk_url"]
bearer_token = account_dict["token"]
# Create a session within the generate function
session = requests.Session()
# Call target selector and pass the session as an argument
selected_url, errors = self.select_url(session, splunk_url)
# end of get configuration
# If none of the endpoints could be reached
if not selected_url:
error_msg = "None of the endpoints provided in the account URLs could be reached successfully, verify your network connectivity!"
error_msg += "Errors: " + ", ".join(
[f"{url}: {error}" for url, error in errors]
)
logging.error(error_msg)
remote_service_established = None
else:
# Enforce https and remove trailing slash in the URL, if any
selected_url = (
f"https://{selected_url.replace('https://', '').rstrip('/')}"
)
# use urlparse to extract relevant info from target
parsed_url = urllib.parse.urlparse(selected_url)
# Establish the remote service
logging.debug(
f'Establishing connection to host="{parsed_url.hostname}" on port="{parsed_url.port}"'
)
# establish connectivity
(
remote_service_established,
service,
header,
) = self.establish_remote_service(
selected_url,
bearer_token,
"nobody",
"search",
account,
)
else:
# local connectivity
service = self.service
# Start logic
if account == "local" or remote_service_established:
# run a Splunk search against the target and store as a dict per savedsearch_name, containing the last known update epochtime and the user who updated it
# kwargs
kwargs_oneshot = {
"earliest_time": "-60m",
"latest_time": "now",
"output_mode": "json",
"count": 0,
}
search_string = f"""\
search index=_internal sourcetype=splunkd_ui_access splunkd servicesNS "saved/searches" method=POST {self.filters_get_last_updates}
| regex uri="/[^/]*/splunkd/__raw/servicesNS/[^/]*/[^/]*/saved/searches/[^/ ]*$"
| rex field=uri "/[^/]*/splunkd/__raw/servicesNS/[^/]*/[^/]*/saved/searches/(?<search_encoded>[^/\\? ]*)"
| eval savedsearch_name=urldecode(search_encoded)
| rename user as user
| fields _time savedsearch_name user
| stats latest(user) as user, max(_time) as time by savedsearch_name
| sort 0 savedsearch_name
"""
start_time = time.time()
# last_updates dict
last_updates_dict = {}
# run search
try:
reader = run_splunk_search(
service,
remove_leading_spaces(search_string),
kwargs_oneshot,
24,
5,
)
for item in reader:
if isinstance(item, dict):
last_updates_dict[item["savedsearch_name"]] = {
"user": item["user"],
"time": item["time"],
}
logging.debug(
f'tenant_id="{self.tenant_id}", get_last_updates successfully completed in {round(time.time() - start_time, 2)} seconds, {len(last_updates_dict)} results were returned.'
)
except Exception as e:
msg = f'tenant_id="{self.tenant_id}", main search failed with exception="{str(e)}"'
logging.error(msg)
raise Exception(msg)
# return the last_updates_dict
return last_updates_dict
# process savedsearch
def process_savedsearch(
self,
session,
record,
kvrecordkey,
kvrecorddict,
local_splunkd_port,
session_key,
server_rest_uri,
splunk_index,
splunk_sourcetype,
splunk_source,
last_updates_dict,
splk_general_workload_version_id_keys,
):
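"""Retrieve the savedsearch definition for a single upstream record, update the versioning (and optionally orphan) KVstore collections, index a version event when a change is detected, and return the enriched record, or a default record on failure."""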
tenant_id = record.get("tenant_id")
account = record.get("account")
record_app = record.get("app")
record_user = record.get("user")
record_savedsearch_name = decode_unicode(record.get("savedsearch_name"))
record_object = record.get("object")
record_object_id = record.get("object_id")
record_metrics = json.loads(record.get("metrics"))
# if user is system, connect as nobody
if record_user == "system":
connect_user = "nobody"
else:
connect_user = record_user
if record_savedsearch_name.startswith("_ACCELERATE"):
return self.yield_default_record(
tenant_id,
record_object,
record_object_id,
account,
record_app,
record_user,
record_savedsearch_name,
"Not applicable for datamodel acceleration searches",
)
else:
# check if record_savedsearch_name contains backslashes replaced with unicode, if so, decode it
if "\\u005c" in record_savedsearch_name:
record_savedsearch_name = record_savedsearch_name.replace(
"\\u005c", "\\"
)
if account != "local":
# get account
account_dict = self.get_account(session_key, server_rest_uri, account)
splunk_url = account_dict["splunk_url"]
bearer_token = account_dict["token"]
# Create a session within the generate function
session = requests.Session()
# Call target selector and pass the session as an argument
selected_url, errors = self.select_url(session, splunk_url)
# end of get configuration
# If none of the endpoints could be reached
if not selected_url:
error_msg = "None of the endpoints provided in the account URLs could be reached successfully, verify your network connectivity!"
logging.error(error_msg)
remote_service_established = None
else:
# Enforce https and remove trailing slash in the URL, if any
selected_url = (
f"https://{selected_url.replace('https://', '').rstrip('/')}"
)
# use urlparse to extract relevant info from target
parsed_url = urllib.parse.urlparse(selected_url)
# Establish the remote service
logging.debug(
f'Establishing connection to host="{parsed_url.hostname}" on port="{parsed_url.port}"'
)
# establish connectivity
(
remote_service_established,
service,
header,
) = self.establish_remote_service(
selected_url,
bearer_token,
connect_user,
record_app,
account,
)
# will fail if the user does not exist anymore, then connect as nobody
if not remote_service_established:
logging.info(
f'connection has failed for user="{record_user}", retrying with "nobody", this is expected if we have a low level of privileges.'
)
(
remote_service_established,
service,
header,
) = self.establish_remote_service(
selected_url,
bearer_token,
"nobody",
record_app,
account,
)
else:
# local connectivity
logging.debug("establish local connectivity")
selected_url, service, header = self.establish_local_service(
session_key, connect_user, record_app
)
# will fail if the user does not exist anymore, then connect as nobody
if not service:
selected_url, service, header = self.establish_local_service(
session_key, "nobody", record_app
)
if service:
logging.debug("local connectivity established successfully")
# Start logic
if account == "local" or remote_service_established:
# Versioning collection
versioning_collection_name = "kv_trackme_wlk_versioning_tenant_%s" % (
self.tenant_id
)
versioning_collection = self.service.kvstore[versioning_collection_name]
# Orphan collection
orphan_collection_name = "kv_trackme_wlk_orphan_status_tenant_%s" % (
self.tenant_id
)
orphan_collection = self.service.kvstore[orphan_collection_name]
# process
try:
logging.debug(
f"processing record_savedsearch_name={record_savedsearch_name}"
)
savedsearch = service.saved_searches[record_savedsearch_name]
# debug
logging.debug(
f'savedsearch="{savedsearch.name}", alternate="{savedsearch.links["alternate"]}"'
)
# record
# init
savedsearch_owner = None
savedsearch_sharing = None
savedsearch_orphan = None
version_id = None
#
# check orphan & retrieve acl
#
if self.check_orphan:
record_url = "%s/%s/%s" % (
selected_url,
savedsearch.links["alternate"],
"?add_orphan_field=yes&output_mode=json",
)
else:
record_url = "%s/%s/%s" % (
selected_url,
savedsearch.links["alternate"],
"/acl/list?output_mode=json",
)
try:
response = session.get(record_url, headers=header, verify=False)
savedsearch_content = json.loads(response.text).get("entry")[0][
"content"
]
savedsearch_acl = json.loads(response.text).get("entry")[0][
"acl"
]
savedsearch_owner = savedsearch_acl.get("owner")
savedsearch_app = savedsearch_acl.get("app")
savedsearch_sharing = savedsearch_acl.get("sharing")
savedsearch_orphan = savedsearch_content.get("orphan")
logging.debug(
f'get extended metadata for savedsearch="{savedsearch.name}" successful, orphan="{savedsearch_orphan}", acl="{json.dumps(savedsearch_acl, indent=2)}"'
)
except Exception as e:
logging.error(
f'get extended metadata for savedsearch="{savedsearch.name}" error, exception="{str(e)}"'
)
return self.yield_default_record(
tenant_id,
record_object,
record_object_id,
account,
record_app,
record_user,
record_savedsearch_name,
f'get extended metadata for savedsearch="{savedsearch.name}" error, exception="{str(e)}"',
)
# if check orphan
if self.check_orphan:
# Define the KV query
query_string = {
"_key": record_object_id,
}
try:
currentorphanrecord = orphan_collection.data.query(
query=json.dumps(query_string)
)[0]
except Exception as e:
currentorphanrecord = None
# set the orphan record
neworphanrecord = {
"_key": record_object_id,
"mtime": time.time(),
"object": record_object,
"app": record_app,
"user": record_user,
"orphan": savedsearch_orphan,
}
# update or insert
try:
if not currentorphanrecord:
# Register a new record
orphan_collection.data.insert(
json.dumps(neworphanrecord)
)
# Update the existing record
else:
orphan_collection.data.update(
record_object_id, json.dumps(neworphanrecord)
)
except Exception as e:
logging.error(
f'failed to update or insert the orphan collection record="{json.dumps(neworphanrecord, indent=2)}", exception="{str(e)}"'
)
# mandatory, stop here if we cannot retrieve the search
try:
savedsearch_search = savedsearch.content["search"]
savedsearch_content = savedsearch.content
except Exception as e:
logging.error(
f'failed to retrieve savedsearch content for savedsearch="{record_savedsearch_name}", we might not have enough permissions to do so, exception="{str(e)}"'
)
return self.yield_default_record(
tenant_id,
record_object,
record_object_id,
account,
record_app,
record_user,
record_savedsearch_name,
f'failed to retrieve savedsearch content for savedsearch="{record_savedsearch_name}", exception="{str(e)}"',
)
# do not fail for the following
savedsearch_cron_schedule = savedsearch_content.get("cron_schedule")
savedsearch_description = savedsearch_content.get("description")
savedsearch_disabled = savedsearch_content.get("disabled")
savedsearch_is_scheduled = savedsearch_content.get("is_scheduled")
savedsearch_schedule_window = savedsearch_content.get(
"schedule_window"
)
savedsearch_workload_pool = savedsearch_content.get("workload_pool")
savedsearch_earliest_time = savedsearch_content.get(
"dispatch.earliest_time"
)
savedsearch_latest_time = savedsearch_content.get(
"dispatch.latest_time"
)
# set the version_id using configurable keys
# splk_general_workload_version_id_keys is already a list from upstream processing
try:
if isinstance(splk_general_workload_version_id_keys, list):
version_id_keys = [key.strip() for key in splk_general_workload_version_id_keys if key and key.strip()]
else:
version_id_keys = [key.strip() for key in splk_general_workload_version_id_keys.split(",") if key and key.strip()]
# Map configuration keys to their corresponding savedsearch values
key_mapping = {
"search": savedsearch_search,
"dispatch.earliest": savedsearch_earliest_time,
"dispatch.latest": savedsearch_latest_time,
}
# Build version_hash using the configured keys
version_values = []
for key in version_id_keys:
try:
if key in key_mapping:
# Use the mapped value for the 3 default keys
value = key_mapping[key]
version_values.append(str(value) if value is not None else "")
elif '*' in key or '?' in key:
# Wildcard pattern - find all matching keys
if savedsearch_content:
matching_keys = [k for k in savedsearch_content.keys() if fnmatch.fnmatch(k, key)]
matching_keys.sort() # Sort for consistent ordering
for matching_key in matching_keys:
value = savedsearch_content.get(matching_key, "")
version_values.append(str(value) if value is not None else "")
else:
# If no savedsearch_content, add empty string for consistency
version_values.append("")
else:
# For other keys, try to get the value directly from savedsearch_content
value = savedsearch_content.get(key, "")
version_values.append(str(value) if value is not None else "")
except Exception as e:
# If there's an error processing a specific key, use empty string and log
logging.warning(f'Error processing version_id key "{key}" for savedsearch "{savedsearch.name}": {str(e)}')
version_values.append("")
version_hash = ":".join(version_values)
except Exception as e:
# Fallback to default behavior if there's an error with the configurable keys
logging.error(f'Error processing version_id keys for savedsearch "{savedsearch.name}": {str(e)}, falling back to default')
version_hash = "%s:%s:%s" % (
savedsearch_search or "",
savedsearch_earliest_time or "",
savedsearch_latest_time or "",
)
version_id = hashlib.sha256(
version_hash.encode("utf-8")
).hexdigest()
# get the cron_exec_sequence_sec
try:
cron_exec_sequence_sec = cron_to_seconds(
savedsearch_cron_schedule
)
except Exception as e:
cron_exec_sequence_sec = 0
# set the json_data
json_data = {
"time_inspected": time.strftime(
"%d %b %Y %H:%M", time.localtime(time.time())
),
"time_inspected_epoch": time.time(),
"savedsearch_name": savedsearch.name,
"search": savedsearch_search,
"earliest_time": savedsearch_earliest_time,
"latest_time": savedsearch_latest_time,
"cron_schedule": savedsearch_cron_schedule,
"cron_exec_sequence_sec": cron_exec_sequence_sec,
"description": savedsearch_description,
"disabled": savedsearch_disabled,
"is_scheduled": savedsearch_is_scheduled,
"schedule_window": savedsearch_schedule_window,
"workload_pool": savedsearch_workload_pool,
"app": savedsearch_app,
"owner": savedsearch_owner,
"sharing": savedsearch_sharing,
"metrics_summary": record_metrics,
"version_id": version_id,
}
if self.check_orphan:
json_data["orphan"] = savedsearch_orphan
# try find in the dict last_updates_dict the last known update for this savedsearch (time and user)
try:
if record_savedsearch_name in last_updates_dict:
last_update = last_updates_dict[record_savedsearch_name]
json_data["last_update_time_epoch"] = last_update["time"]
# create a last_update_time_human which is epoch strftime %c
json_data["last_update_time_human"] = time.strftime(
"%c", time.localtime(float(last_update["time"]))
)
json_data["last_update_user"] = last_update["user"]
except Exception as e:
logging.error(
f'failed to retrieve last update info for savedsearch="{record_savedsearch_name}", exception="{str(e)}"'
)
# empty json_dict
json_dict = {}
# if it exists already, update the KVstore record, otherwise create a new record
# if the record exists already, we also need to update the dictionary
if self.context == "live":
try:
if not kvrecordkey:
json_dict[version_id] = json_data
sorted_json_dict = self.sort_json_by_epoch(json_dict)
versioning_collection.data.insert(
json.dumps(
{
"_key": record_object_id,
"mtime": time.time(),
"object": record_object,
"version_dict": json.dumps(
sorted_json_dict, indent=2
),
"description": savedsearch_description,
"current_version_id": version_id,
"cron_exec_sequence_sec": cron_exec_sequence_sec,
}
)
)
# ingest
self.ingest_version(
record_object,
splunk_index,
splunk_sourcetype,
splunk_source,
json_data,
)
else:
# update
search_change_detected = False
if version_id not in kvrecorddict:
search_change_detected = True
# get the last currently known record for that instance
sorted_json_dict = self.sort_json_by_epoch(kvrecorddict)
last_known_record = sorted_json_dict[
list(sorted_json_dict)[0]
]
# get the last known earliest_time, latest_time, search from last_known_record
last_known_earliest_time = last_known_record.get(
"earliest_time"
)
last_known_latest_time = last_known_record.get(
"latest_time"
)
last_known_search = last_known_record.get("search")
kvrecorddict[version_id] = json_data
sorted_json_dict = self.sort_json_by_epoch(kvrecorddict)
# for each configured key, compare with the current record and create a diff_<field>
try:
# Map configuration keys to their corresponding savedsearch values and last known values
key_mapping = {
"search": (savedsearch_search, last_known_search),
"dispatch.earliest": (savedsearch_earliest_time, last_known_earliest_time),
"dispatch.latest": (savedsearch_latest_time, last_known_latest_time),
}
# Generate diff strings for all configured keys
for key in version_id_keys:
try:
if key in key_mapping:
# Use the mapped values for the 3 default keys
current_value, last_known_value = key_mapping[key]
# Generate diff string if values are different (including empty to non-empty transitions)
# Normalize values for comparison (treat None and empty string as equivalent)
last_known_normalized = str(last_known_value).strip() if last_known_value is not None else ""
current_normalized = str(current_value).strip() if current_value is not None else ""
if last_known_normalized != current_normalized:
diff_string = self.generate_diff_string(
last_known_value, current_value
)
# Create diff field name by replacing dots with underscores and adding diff_ prefix
diff_field_name = f"diff_{key.replace('.', '_')}"
json_data[diff_field_name] = diff_string
elif '*' in key or '?' in key:
# Wildcard pattern - find all matching keys and generate diff for each
if savedsearch_content:
matching_keys = [k for k in savedsearch_content.keys() if fnmatch.fnmatch(k, key)]
matching_keys.sort() # Sort for consistent ordering
for matching_key in matching_keys:
current_value = savedsearch_content.get(matching_key, "") if savedsearch_content else ""
last_known_value = last_known_record.get(matching_key) if last_known_record else None
# Generate diff string if values are different (including empty to non-empty transitions)
# Normalize values for comparison (treat None and empty string as equivalent)
last_known_normalized = str(last_known_value).strip() if last_known_value is not None else ""
current_normalized = str(current_value).strip() if current_value is not None else ""
if last_known_normalized != current_normalized:
diff_string = self.generate_diff_string(
last_known_value, current_value
)
# Create diff field name by replacing dots with underscores and adding diff_ prefix
diff_field_name = f"diff_{matching_key.replace('.', '_')}"
json_data[diff_field_name] = diff_string
else:
# For other keys, get values directly from savedsearch_content and last_known_record
current_value = savedsearch_content.get(key, "") if savedsearch_content else ""
last_known_value = last_known_record.get(key) if last_known_record else None
# Generate diff string if values are different (including empty to non-empty transitions)
# Normalize values for comparison (treat None and empty string as equivalent)
last_known_normalized = str(last_known_value).strip() if last_known_value is not None else ""
current_normalized = str(current_value).strip() if current_value is not None else ""
if last_known_normalized != current_normalized:
diff_string = self.generate_diff_string(
last_known_value, current_value
)
# Create diff field name by replacing dots with underscores and adding diff_ prefix
diff_field_name = f"diff_{key.replace('.', '_')}"
json_data[diff_field_name] = diff_string
except Exception as e:
# If there's an error processing a specific key, log and continue
logging.warning(f'Error generating diff for key "{key}" for savedsearch "{savedsearch.name}": {str(e)}')
continue
except Exception as e:
# If there's an error with the entire diff generation, log and continue
logging.error(f'Error in diff string generation for savedsearch "{savedsearch.name}": {str(e)}')
# ingest if a change is detected
if search_change_detected:
# ingest
self.ingest_version(
record_object,
splunk_index,
splunk_sourcetype,
splunk_source,
json_data,
)
# otherwise and if we have diff information for this record, make sure to preserve it
else:
# Carry over last update and diff information if no change is detected
if "last_update_time_epoch" in last_known_record:
json_data["last_update_time_epoch"] = (
last_known_record["last_update_time_epoch"]
)
if "last_update_time_human" in last_known_record:
json_data["last_update_time_human"] = (
last_known_record["last_update_time_human"]
)
if "last_update_user" in last_known_record:
json_data["last_update_user"] = (
last_known_record["last_update_user"]
)
if "diff_search" in last_known_record:
json_data["diff_search"] = last_known_record[
"diff_search"
]
# update the KVstore record
versioning_collection.data.update(
record_object_id,
{
"_key": record_object_id,
"mtime": time.time(),
"object": record_object,
"version_dict": json.dumps(
sorted_json_dict, indent=2
),
"description": savedsearch_description,
"current_version_id": version_id,
"cron_exec_sequence_sec": cron_exec_sequence_sec,
},
)
except Exception as e:
logging.error(
f'tenant_id="{tenant_id}", object="{record_object}", failure while trying to insert the hybrid KVstore record, exception="{e}"'
)
# return the final record
return {
"_time": time.time(),
"tenant_id": tenant_id,
"object": record_object,
"object_id": record_object_id,
"account": account,
"app": record_app,
"user": record_user,
"savedsearch_name": record_savedsearch_name,
"search": savedsearch_search,
"earliest_time": savedsearch_earliest_time,
"latest_time": savedsearch_latest_time,
"cron_schedule": savedsearch_cron_schedule,
"cron_exec_sequence_sec": cron_exec_sequence_sec,
"description": savedsearch_description,
"disabled": savedsearch_disabled,
"is_scheduled": savedsearch_is_scheduled,
"schedule_window": savedsearch_schedule_window,
"workload_pool": savedsearch_workload_pool,
"owner": savedsearch_owner,
"sharing": savedsearch_sharing,
"metrics": json.dumps(record_metrics, indent=2),
"message": "saved search metadata were retrieved successfully",
"version_id": version_id,
"json_data": json_data,
}
except Exception as e:
# Use the new function to yield the default record when an error occurs
return self.yield_default_record(
tenant_id,
record_object,
record_object_id,
account,
record_app,
record_user,
record_savedsearch_name,
f'failed to retrieve saved search metadata, if the report was recently deleted then this is expected and will disappear shortly, exception="{str(e)}"',
)
else:
# Use the new function to yield the default record when an error occurs
return self.yield_default_record(
tenant_id,
record_object,
record_object_id,
account,
record_app,
record_user,
record_savedsearch_name,
"failed to retrieve saved search metadata, if the report was recently deleted then this is expected and will disappear shortly",
)
# main
def stream(self, records):
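"""Entry point of the streaming command: load configuration, fetch the last updates once for the target account, process each upstream savedsearch record (skipping excluded apps) while enforcing the max runtime budget, then emit handler notification events."""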
if self:
# start perf duration counter
start = time.time()
# Add records in a proper list rather than the builtin generator to address some issues with complex savedsearch names
records_list = []
for record in records:
records_list.append(record)
# Track execution times
average_execution_time = 0
# max runtime
max_runtime = int(self.max_runtime_sec)
# Retrieve the search cron schedule
savedsearch_name = self.report.replace("_wrapper", "_tracker")
savedsearch = self.service.saved_searches[savedsearch_name]
savedsearch_cron_schedule = savedsearch.content["cron_schedule"]
# get the cron_exec_sequence_sec
try:
cron_exec_sequence_sec = int(cron_to_seconds(savedsearch_cron_schedule))
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", component="splk-wlk", failed to convert the cron schedule to seconds, error="{str(e)}"'
)
cron_exec_sequence_sec = max_runtime
# the max_runtime cannot be bigger than the cron_exec_sequence_sec
if max_runtime > cron_exec_sequence_sec:
max_runtime = cron_exec_sequence_sec
logging.info(
f'max_runtime="{max_runtime}", savedsearch_name="{savedsearch_name}", savedsearch_cron_schedule="{savedsearch_cron_schedule}", cron_exec_sequence_sec="{cron_exec_sequence_sec}"'
)
# Get request info and set logging level
reqinfo = trackme_reqinfo(
self._metadata.searchinfo.session_key,
self._metadata.searchinfo.splunkd_uri,
)
log.setLevel(reqinfo["logging_level"])
# Get the session key
session_key = self._metadata.searchinfo.session_key
# Get splunkd_port
local_splunkd_port = urllib.parse.urlparse(
self._metadata.searchinfo.splunkd_uri
).port
# from global configuration, get the value for splk_general_workload_version_id_keys
splk_general_workload_version_id_keys = reqinfo["trackme_conf"]["splk_general"]["splk_general_workload_version_id_keys"]
# split the value into a list
splk_general_workload_version_id_keys = splk_general_workload_version_id_keys.split(",")
# list of forbidden apps
exclude_apps = set(self.exclude_apps.split(","))
# Versioning collection
versioning_collection_name = "kv_trackme_wlk_versioning_tenant_%s" % (
self.tenant_id
)
versioning_collection = self.service.kvstore[versioning_collection_name]
# Get configuration and define metadata
trackme_summary_idx = reqinfo["trackme_conf"]["index_settings"][
"trackme_summary_idx"
]
splunk_index = trackme_summary_idx
splunk_sourcetype = "trackme:wlk:version_id"
splunk_source = "trackme_ingest_version"
# end of get configuration
#
# before processing the records, loop in records and get the first value for account in the first record
#
target_account = None
for record in records_list:
target_account = record.get("account")
break
try:
last_updates_dict = self.get_last_updates(
session_key,
reqinfo["server_rest_uri"],
target_account,
)
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", component="splk-wlk", Failed to call get_last_updates with exception="{str(e)}"'
)
last_updates_dict = {}
#
# loop through upstream records
#
# Initialize sum of execution times and count of iterations
total_execution_time = 0
iteration_count = 0
# for the handler events
report_objects_dict = {}
with requests.Session() as session:
# Loop in the results
for record in records_list:
# iteration start
iteration_start_time = time.time()
if record.get("app") not in exclude_apps:
# object_id
record_object_id = record.get("object_id")
# add the object_id to the report_objects_dict
report_objects_dict[record_object_id] = record.get("object")
# Try to get the KVstore record
kvrecord, kvrecordkey, kvrecorddict = self.get_kv_record(
versioning_collection, record_object_id
)
# Process the saved search and yield the result
yield self.process_savedsearch(
session,
record,
kvrecordkey,
kvrecorddict,
local_splunkd_port,
session_key,
reqinfo["server_rest_uri"],
splunk_index,
splunk_sourcetype,
splunk_source,
last_updates_dict,
splk_general_workload_version_id_keys,
)
# Calculate the execution time for this iteration
iteration_end_time = time.time()
execution_time = iteration_end_time - iteration_start_time
# Update total execution time and iteration count
total_execution_time += execution_time
iteration_count += 1
# Calculate average execution time
if iteration_count > 0:
average_execution_time = total_execution_time / iteration_count
else:
average_execution_time = 0
# Check if there is enough time left to continue
current_time = time.time()
elapsed_time = current_time - start
if elapsed_time + average_execution_time + 120 >= max_runtime:
logging.info(
f'tenant_id="{self.tenant_id}", component="splk-wlk", max_runtime="{max_runtime}" is about to be reached, current_runtime="{elapsed_time}", job will be terminated now'
)
break
# handler event
if report_objects_dict:
handler_events_records = []
for (
report_object_id,
report_object_name,
) in report_objects_dict.items():
handler_events_records.append(
{
"object": report_object_name,
"object_id": report_object_id,
"object_category": "splk-wlk",
"handler": self.report,
"handler_message": "Entity was inspected by an hybrid tracker.",
"handler_troubleshoot_search": f"index=_internal sourcetype=trackme:custom_commands:trackmesplkwlkgetreportsdefstream tenant_id={self.tenant_id}",
"handler_time": time.time(),
}
)
# notification event
try:
trackme_handler_events(
session_key=self._metadata.searchinfo.session_key,
splunkd_uri=self._metadata.searchinfo.splunkd_uri,
tenant_id=self.tenant_id,
sourcetype="trackme:handler",
source=f"trackme:handler:{self.tenant_id}",
handler_events=handler_events_records,
)
except Exception as e:
logging.error(
f'tenant_id="{self.tenant_id}", component="splk-wlk", could not send notification event, exception="{e}"'
)
if self.report:
logging.info(
f'trackmesplkwlkgetreportsdefstream has terminated, report="{self.report}", run_time="{round(time.time() - start, 3)}"'
)
else:
logging.info(
f'trackmesplkwlkgetreportsdefstream has terminated, run_time="{round(time.time() - start, 3)}"'
)
dispatch(SplkWlkGetReportsDef, sys.argv, sys.stdin, sys.stdout, __name__)