You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Splunk_Deploiement/apps/trackme/bin/trackmefieldsquality.py

737 lines
32 KiB

#!/usr/bin/env python
# coding=utf-8
__author__ = "TrackMe Limited"
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
__credits__ = "TrackMe Limited, U.K."
__license__ = "TrackMe Limited, all rights reserved"
__version__ = "0.1.0"
__maintainer__ = "TrackMe Limited, U.K."
__email__ = "support@trackme-solutions.com"
__status__ = "PRODUCTION"
import os
import sys
import time
from datetime import datetime, timezone
import json
import logging
import re
from logging.handlers import RotatingFileHandler
import urllib3
import hashlib
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Resolve the Splunk installation root; required for log and app file paths.
splunkhome = os.environ["SPLUNK_HOME"]

# Configure a dedicated rotating log file for this command (10 MB, 1 backup),
# with UTC timestamps to match Splunk's internal logging convention.
filehandler = RotatingFileHandler(
    f"{splunkhome}/var/log/splunk/trackme_trackmefieldsquality.log",
    mode="a",
    maxBytes=10000000,
    backupCount=1,
)
formatter = logging.Formatter(
    "%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s"
)
logging.Formatter.converter = time.gmtime
filehandler.setFormatter(formatter)

# Grab the root logger once, drop any pre-existing file handlers (Splunk may
# have attached its own), then install ours.
log = logging.getLogger()
for existing_handler in log.handlers[:]:
    if isinstance(existing_handler, logging.FileHandler):
        log.removeHandler(existing_handler)
log.addHandler(filehandler)

# Default to INFO (root default is ERROR); may be raised to DEBUG at runtime.
log.setLevel(logging.INFO)

# Make sibling modules in this bin directory importable.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# import libs
import import_declare_test
# import Splunk libs
from splunklib.searchcommands import (
dispatch,
StreamingCommand,
Configuration,
Option,
validators,
)
# import trackme libs
from trackme_libs import (
trackme_reqinfo,
run_splunk_search,
)
# import trackme licensing libs
from trackme_libs_licensing import trackme_check_license
# import trackme libs utils
from trackme_libs_utils import remove_leading_spaces, strict_interpret_boolean
@Configuration(distributed=False)
class TrackMeFieldsQuality(StreamingCommand):
    """
    trackmefieldsquality: a streaming command that verifies, for every incoming
    record, a set of expected fields (existence, non-empty, not 'unknown',
    optional regex match) and yields a per-record JSON quality summary.

    Exactly one of the mutually exclusive options below may be used to define
    the fields to check: a CSV list, a record fieldname containing a CSV list,
    an inline JSON dict, a JSON file path, a record fieldname containing a JSON
    dict, or a Splunk search command producing a `json_dict` field.
    """

    fields_to_check_list = Option(
        doc="""
        **Syntax:** **fields_to_check_list=****
        **Description:** The list of fields to verified, provided as an argument to the command in a comma separated list.""",
        require=False,
        default=None,
        validate=validators.Match("fields_to_check_list", r"^.*$"),
    )

    fields_to_check_search_command = Option(
        doc="""
        **Syntax:** **fields_to_check_search_command=****
        **Description:** The search command to use to generate the dictionary of fields to check.""",
        require=False,
        default=None,
        validate=validators.Match("fields_to_check_search_command", r"^.*$"),
    )

    fields_to_check_fieldname = Option(
        doc="""
        **Syntax:** **fields_to_check_fieldname=****
        **Description:** Alternatively, the name of the field containing the list of fields to check, provided in a comma separated list.""",
        require=False,
        default=None,
        validate=validators.Match("fields_to_check_fieldname", r"^.*$"),
    )

    fields_to_check_dict = Option(
        doc="""
        **Syntax:** **fields_to_check_dict=****
        **Description:** A JSON string containing a dictionary of fields to check with optional regex patterns and validation settings.
        Example: {"field1": {"name": "field1", "regex": "^[A-Z]+$", "allow_unknown": true, "allow_empty_or_missing": false}, "field2": {"name": "field2"}}""",
        require=False,
        default=None,
        validate=validators.Match("fields_to_check_dict", r"^.*$"),
    )

    fields_to_check_dict_path = Option(
        doc="""
        **Syntax:** **fields_to_check_dict_path=****
        **Description:** Path to a JSON file containing a dictionary of fields to check with optional regex patterns and validation settings.
        Example: $SPLUNK_HOME/etc/apps/trackme/lookups/fields_config.json""",
        require=False,
        default=None,
        validate=validators.Match("fields_to_check_dict_path", r"^.*$"),
    )

    fields_to_check_dict_fieldname = Option(
        doc="""
        **Syntax:** **fields_to_check_dict_fieldname=****
        **Description:** The name of the field containing a JSON string with a dictionary of fields to check with optional regex patterns and validation settings.
        """,
        require=False,
        default=None,
        validate=validators.Match("fields_to_check_dict_fieldname", r"^.*$"),
    )

    include_field_values = Option(
        doc="""
        **Syntax:** **include_field_values=****
        **Description:** Boolean option to include field values in the JSON summary.
        """,
        require=False,
        default=False,
        validate=validators.Boolean(),
    )

    pretty_print_json = Option(
        doc="""
        **Syntax:** **pretty_print_json=****
        **Description:** Boolean option to pretty print the JSON summary. Default is True.
        """,
        require=False,
        default=True,
        validate=validators.Boolean(),
    )

    output_mode = Option(
        doc="""
        **Syntax:** **output_mode=****
        **Description:** The mode to output the results. Default is json, valid options are json and raw.
        """,
        require=False,
        default="json",
        # NOTE: the pattern must be grouped; a bare ^json|raw$ would accept
        # values such as "jsonfoo" or "fooraw" due to alternation precedence.
        validate=validators.Match("output_mode", r"^(json|raw)$"),
    )

    metadata_fields = Option(
        doc="""
        **Syntax:** **metadata_fields=****
        **Description:** A CSV list of metadata fields to include in the metadata section of the JSON when using output_mode=json. index/sourcetype/host/source are always included, you can add others to be included in the metadata section.
        """,
        require=False,
        default="index,sourcetype,host,source",
        validate=validators.Match("metadata_fields", r"^.*$"),
    )

    summary_fieldname = Option(
        doc="""
        **Syntax:** **summary_fieldname=****
        **Description:** Defines the name of the summary field. Default is 'summary'.
        """,
        require=False,
        default="summary",
        validate=validators.Match("summary_fieldname", r"^.*$"),
    )

    metadata_fieldname = Option(
        doc="""
        **Syntax:** **metadata_fieldname=****
        **Description:** Defines the name of the metadata field added to the summary JSON. Default is 'metadata'.
        """,
        require=False,
        default="metadata",
        validate=validators.Match("metadata_fieldname", r"^.*$"),
    )

    @staticmethod
    def _validate_fields_dict(fields_dict):
        """Validate the structure of a fields dictionary.

        Each value must be a dict with a string 'name' property and, when
        present, a string 'regex' property. Raises ValueError otherwise.
        """
        for field_name, field_info in fields_dict.items():
            if not isinstance(field_info, dict):
                raise ValueError(f"Field {field_name} must be a dictionary")
            if "name" not in field_info:
                raise ValueError(f"Field {field_name} must have a 'name' property")
            if not isinstance(field_info["name"], str):
                raise ValueError(f"Field {field_name} name must be a string")
            if "regex" in field_info and not isinstance(field_info["regex"], str):
                raise ValueError(
                    f"Field {field_name} regex must be a string if provided"
                )

    def _resolve_fields_dict(self, record, json_dict):
        """Build the fields dictionary for one record from whichever input
        option was provided (options are mutually exclusive, enforced in
        stream()). Returns an empty dict when no option is set."""
        if self.fields_to_check_list:
            fields_to_check = self.fields_to_check_list.split(",")
            return {
                field.strip(): {"name": field.strip()} for field in fields_to_check
            }

        if self.fields_to_check_fieldname:
            fields_to_check = record.get(self.fields_to_check_fieldname)
            # multivalue fields arrive as lists: keep the first value only
            if isinstance(fields_to_check, list):
                fields_to_check = fields_to_check[0]
            fields_to_check = fields_to_check.split(",")
            return {
                field.strip(): {"name": field.strip()} for field in fields_to_check
            }

        if self.fields_to_check_dict:
            try:
                fields_dict = json.loads(self.fields_to_check_dict)
                self._validate_fields_dict(fields_dict)
            except json.JSONDecodeError:
                raise ValueError(
                    f"Invalid JSON format in fields_to_check_dict: {self.fields_to_check_dict}"
                )
            return fields_dict

        if self.fields_to_check_dict_path:
            try:
                # Relative paths are resolved from SPLUNK_HOME
                if not os.path.isabs(self.fields_to_check_dict_path):
                    file_path = os.path.join(
                        splunkhome, self.fields_to_check_dict_path
                    )
                else:
                    file_path = self.fields_to_check_dict_path
                if not os.path.exists(file_path):
                    raise ValueError(f"JSON file not found: {file_path}")
                with open(file_path, "r") as f:
                    fields_dict = json.load(f)
                self._validate_fields_dict(fields_dict)
            except json.JSONDecodeError:
                raise ValueError(
                    f"Invalid JSON format in file: {self.fields_to_check_dict_path}"
                )
            except IOError as e:
                raise ValueError(f"Error reading JSON file: {str(e)}")
            return fields_dict

        if self.fields_to_check_dict_fieldname:
            json_string = None
            try:
                json_string = record.get(self.fields_to_check_dict_fieldname)
                # multivalue fields arrive as lists: keep the first value only
                if isinstance(json_string, list):
                    json_string = json_string[0]
                fields_dict = json.loads(json_string)
                self._validate_fields_dict(fields_dict)
            except json.JSONDecodeError:
                raise ValueError(
                    f"Invalid JSON format in fields_to_check_dict_fieldname with field_name: {self.fields_to_check_dict_fieldname} and json_string: {json_string}"
                )
            return fields_dict

        if self.fields_to_check_search_command:
            # json_dict is None when the generating search returned no result:
            # fall back to an empty dict instead of crashing on None.values()
            return json_dict if json_dict is not None else {}

        return {}

    def _build_field_summary(
        self,
        status,
        description,
        is_missing,
        is_empty,
        is_unknown,
        regex_pattern,
        regex_failure,
        field_value,
    ):
        """Assemble the per-field summary dict, adding regex details only when
        a pattern was configured and the raw value only when requested."""
        field_summary = {
            "status": status,
            "description": description,
            "is_missing": is_missing,
            "is_empty": is_empty,
            "is_unknown": is_unknown,
        }
        if regex_pattern:
            field_summary["regex_failure"] = regex_failure
            field_summary["regex_expression"] = regex_pattern
        if self.include_field_values:
            field_summary["value"] = field_value
        return field_summary

    def _evaluate_field(self, record, field_info):
        """Check a single field of a record.

        Returns a (output_field_name, field_summary, passed) tuple where
        passed is True when the field satisfied the configured checks.
        """
        field = field_info["name"]

        # Reserved output keys are renamed orig_<field_name> so they cannot
        # collide with the internal keys added to the summary JSON.
        output_field_name = field
        if field in ("metadata", "event_id", "summary"):
            output_field_name = f"orig_{field}"
            logging.info(
                f'context="reserved_field", field="{field}" renamed to "{output_field_name}"'
            )

        regex_pattern = field_info.get("regex")
        allow_unknown = strict_interpret_boolean(
            field_info.get("allow_unknown", False)
        )
        allow_empty_or_missing = strict_interpret_boolean(
            field_info.get("allow_empty_or_missing", False)
        )
        field_value = record.get(field)

        is_missing = field_value is None
        is_empty = False
        is_unknown = False
        regex_failure = False

        # Missing field: pass or fail depending on allow_empty_or_missing
        if is_missing:
            if allow_empty_or_missing:
                return (
                    output_field_name,
                    self._build_field_summary(
                        "success",
                        "Field does not exist but is allowed to be missing.",
                        is_missing,
                        is_empty,
                        is_unknown,
                        regex_pattern,
                        regex_failure,
                        field_value,
                    ),
                    True,
                )
            return (
                output_field_name,
                self._build_field_summary(
                    "failure",
                    "Field does not exist.",
                    is_missing,
                    is_empty,
                    is_unknown,
                    regex_pattern,
                    regex_failure,
                    field_value,
                ),
                False,
            )

        if isinstance(field_value, list):
            # Multivalue field: every item must pass; 'unknown' items are
            # tolerated (and skip the regex check) when allow_unknown is set.
            reason = None
            for item in field_value:
                if isinstance(item, str) and item.lower() == "unknown":
                    if not allow_unknown:
                        reason = "contains 'unknown'"
                        is_unknown = True
                        break
                elif regex_pattern and not re.match(regex_pattern, str(item)):
                    reason = "one or more values in the list do not match the required pattern"
                    regex_failure = True
                    break
            if reason is not None:
                return (
                    output_field_name,
                    self._build_field_summary(
                        "failure",
                        f"Field exists but {reason}.",
                        is_missing,
                        is_empty,
                        is_unknown,
                        regex_pattern,
                        regex_failure,
                        field_value,
                    ),
                    False,
                )
        else:
            # Scalar value checks: empty, 'unknown', then regex (elif chain,
            # so an allowed 'unknown' value intentionally bypasses the regex).
            if field_value == "":
                if allow_empty_or_missing:
                    return (
                        output_field_name,
                        self._build_field_summary(
                            "success",
                            "Field is empty but is allowed to be empty.",
                            is_missing,
                            True,
                            is_unknown,
                            regex_pattern,
                            regex_failure,
                            field_value,
                        ),
                        True,
                    )
                is_empty = True
                return (
                    output_field_name,
                    self._build_field_summary(
                        "failure",
                        "Field is empty.",
                        is_missing,
                        is_empty,
                        is_unknown,
                        regex_pattern,
                        regex_failure,
                        field_value,
                    ),
                    False,
                )
            elif isinstance(field_value, str) and field_value.lower() == "unknown":
                if not allow_unknown:
                    is_unknown = True
                    return (
                        output_field_name,
                        self._build_field_summary(
                            "failure",
                            "Field is 'unknown'.",
                            is_missing,
                            is_empty,
                            is_unknown,
                            regex_pattern,
                            regex_failure,
                            field_value,
                        ),
                        False,
                    )
            elif regex_pattern and not re.match(regex_pattern, str(field_value)):
                regex_failure = True
                return (
                    output_field_name,
                    self._build_field_summary(
                        "failure",
                        "Field exists but value does not match the required pattern.",
                        is_missing,
                        is_empty,
                        is_unknown,
                        regex_pattern,
                        regex_failure,
                        field_value,
                    ),
                    False,
                )

        # Field exists, is not empty, not a disallowed 'unknown', and matched
        # the regex when one was configured.
        return (
            output_field_name,
            self._build_field_summary(
                "success",
                "Field exists and is valid.",
                is_missing,
                is_empty,
                is_unknown,
                regex_pattern,
                regex_failure,
                field_value,
            ),
            True,
        )

    def stream(self, records):
        """Process each record: verify license, resolve the fields to check,
        evaluate each field, then yield a JSON (or raw) quality summary."""
        # Start performance counter
        start = time.time()

        # Get request info and set logging level
        reqinfo = trackme_reqinfo(
            self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri
        )
        log.setLevel(reqinfo["logging_level"])

        # check license state; check_license is pre-initialized so the
        # restricted-component error log below cannot hit a NameError when
        # trackme_check_license itself raised
        check_license = {}
        try:
            check_license = trackme_check_license(
                reqinfo["server_rest_uri"], self._metadata.searchinfo.session_key
            )
            license_is_valid = check_license.get("license_is_valid")
            logging.debug(
                f'function check_license called, response="{json.dumps(check_license, indent=2)}"'
            )
        except Exception as e:
            license_is_valid = 0
            logging.error(f'function check_license exception="{str(e)}"')

        # this component requires a valid Full or Trial license
        if license_is_valid != 1:
            logging.error(
                f'The requested component is restricted to the Full and Trial edition mode, its execution cannot be accepted, check_license="{json.dumps(check_license, indent=2)}"'
            )
            raise Exception(
                f"The requested component is restricted to the Full and Trial edition mode, its execution cannot be accepted, please contact your Splunk administrator."
            )

        # the fields-definition options are mutually exclusive
        provided_options = [
            self.fields_to_check_list,
            self.fields_to_check_fieldname,
            self.fields_to_check_dict,
            self.fields_to_check_dict_path,
            self.fields_to_check_dict_fieldname,
            self.fields_to_check_search_command,
        ]
        if sum(1 for option in provided_options if option) > 1:
            raise ValueError(
                "Only one of fields_to_check_list, fields_to_check_fieldname, fields_to_check_dict, fields_to_check_dict_path, fields_to_check_dict_fieldname or fields_to_check_search_command can be provided"
            )

        # if fields_to_check_search_command is provided, run the search and
        # load its json_dict field once; it is shared by every record
        json_dict = None
        if self.fields_to_check_search_command:
            try:
                reader = run_splunk_search(
                    self.service,
                    remove_leading_spaces(self.fields_to_check_search_command),
                    {
                        "earliest_time": "-5m",
                        "latest_time": "now",
                        "preview": "false",
                        "output_mode": "json",
                        "count": 0,
                    },
                    24,
                    5,
                )
                for item in reader:
                    if isinstance(item, dict):
                        if "json_dict" not in item:
                            raise ValueError(
                                f"json_dict field not found in the search results for search command: {self.fields_to_check_search_command}"
                            )
                        json_dict = json.loads(item["json_dict"])
                        # only one result is expected
                        break
            except Exception as e:
                error_msg = f'context="error", trackmefieldsquality has failed with exception="{str(e)}"'
                logging.error(error_msg)
                raise Exception(error_msg)

        # Loop in the results
        records_count = 0
        for record in records:
            records_count += 1
            json_summary = {"time": float(record.get("_time", time.time()))}

            fields_dict = self._resolve_fields_dict(record, json_dict)

            # Evaluate every configured field and maintain the counters
            total_fields_checked = 0
            total_fields_failed = 0
            total_fields_passed = 0
            list_fields_passed = []
            list_fields_failed = []

            for field_info in fields_dict.values():
                output_field_name, field_summary, passed = self._evaluate_field(
                    record, field_info
                )
                json_summary[output_field_name] = field_summary
                total_fields_checked += 1
                if passed:
                    total_fields_passed += 1
                    list_fields_passed.append(output_field_name)
                else:
                    total_fields_failed += 1
                    list_fields_failed.append(output_field_name)

            # Determine overall status; guard the percentages against an
            # empty fields dictionary (division by zero)
            overall_status = "success" if total_fields_failed == 0 else "failure"
            if total_fields_checked:
                percentage_failed = round(
                    total_fields_failed / total_fields_checked * 100, 2
                )
                percentage_passed = round(
                    total_fields_passed / total_fields_checked * 100, 2
                )
            else:
                percentage_failed = 0.0
                percentage_passed = 0.0

            json_summary[self.summary_fieldname] = {
                "overall_status": overall_status,
                "total_fields_checked": total_fields_checked,
                "total_fields_failed": total_fields_failed,
                "total_fields_passed": total_fields_passed,
                "percentage_failed": percentage_failed,
                "percentage_passed": percentage_passed,
                "list_fields_passed": list_fields_passed,
                "list_fields_failed": list_fields_failed,
            }

            # serialized summary, pretty printed on request (raw mode only;
            # json mode carries the dict itself)
            indent_value = 4 if self.pretty_print_json else None
            yield_record = {
                "json_summary": json.dumps(json_summary, indent=indent_value)
            }

            if self.output_mode == "raw":
                #
                # output_mode=raw: echo the original record fields and add a
                # deterministic event_id (sha256 of the summary)
                #
                for k, v in record.items():
                    yield_record[k] = v
                yield_record["event_id"] = hashlib.sha256(
                    json.dumps(json_summary).encode("utf-8")
                ).hexdigest()
                yield yield_record

            elif self.output_mode == "json":
                #
                # output_mode=json: build the metadata section, then emit the
                # summary dict as _raw
                #
                metadata_json = {}
                event_time = float(record.get("_time", time.time()))
                # epoch plus a human readable (%c %Z, UTC) representation
                metadata_json["time_epoch"] = event_time
                metadata_json["time_human"] = datetime.fromtimestamp(
                    event_time,
                    tz=timezone.utc,
                ).strftime("%c %Z")

                # index, sourcetype, host and source are always included
                metadata_json["index"] = record.get("index")
                metadata_json["sourcetype"] = record.get("sourcetype")
                metadata_json["host"] = record.get("host")
                metadata_json["source"] = record.get("source")

                # additional metadata fields requested by the user
                if self.metadata_fields:
                    for field in self.metadata_fields.split(","):
                        field = field.strip()
                        if field in record and field != "json_summary":
                            metadata_json[field] = record[field]

                # the event_id is computed over the summary including the
                # metadata section, before the event_id key itself is added
                json_summary[self.metadata_fieldname] = metadata_json
                json_summary["event_id"] = hashlib.sha256(
                    json.dumps(json_summary).encode("utf-8")
                ).hexdigest()

                yield_record = {
                    "_time": event_time,
                    "_raw": json_summary,
                    "json_summary": json_summary,
                    "index": record.get("index"),
                    "sourcetype": record.get("sourcetype"),
                    "host": record.get("host"),
                    "source": record.get("source"),
                }
                yield yield_record

        # Log the run time
        logging.info(
            f'context="perf", trackmefieldsquality has terminated, records_count="{records_count}", run_time="{round((time.time() - start), 3)}"'
        )
dispatch(TrackMeFieldsQuality, sys.argv, sys.stdin, sys.stdout, __name__)