Splunk_Deploiement/apps/trackme/bin/trackmefieldsqualityextract.py

#!/usr/bin/env python
# coding=utf-8
__author__ = "TrackMe Limited"
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
__credits__ = "TrackMe Limited, U.K."
__license__ = "TrackMe Limited, all rights reserved"
__version__ = "0.1.0"
__maintainer__ = "TrackMe Limited, U.K."
__email__ = "support@trackme-solutions.com"
__status__ = "PRODUCTION"
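
# Custom streaming search command: parses JSON field-quality events from the
# configured input field and yields one flattened result per field, with
# metadata keys prefixed as "metadata.<key>".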
import os
import sys
import time
import json
import logging
from logging.handlers import RotatingFileHandler
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

splunkhome = os.environ["SPLUNK_HOME"]

# set logging
filehandler = RotatingFileHandler(
    "%s/var/log/splunk/trackme_trackmefieldsqualityextract.log" % splunkhome,
    mode="a",
    maxBytes=10000000,
    backupCount=1,
)
formatter = logging.Formatter(
    "%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s"
)
logging.Formatter.converter = time.gmtime
filehandler.setFormatter(formatter)
log = logging.getLogger()  # root logger - Good to get it only once.
for hdlr in log.handlers[:]:  # remove the existing file handlers
    if isinstance(hdlr, logging.FileHandler):
        log.removeHandler(hdlr)
log.addHandler(filehandler)  # set the new handler
# set the log level to INFO, DEBUG as the default is ERROR
log.setLevel(logging.INFO)

# append current directory
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# import libs
import import_declare_test

# import Splunk libs
from splunklib.searchcommands import (
    dispatch,
    StreamingCommand,
    Configuration,
    Option,
    validators,
)

# import trackme libs
from trackme_libs import (
    trackme_reqinfo,
)

# import trackme licensing libs
from trackme_libs_licensing import trackme_check_license


@Configuration(distributed=False)
class TrackMeFieldsQualityExtract(StreamingCommand):

    input_field = Option(
        doc="""
        **Syntax:** **input_field=****
        **Description:** The field containing the JSON data to extract. Default is '_raw'.
        """,
        require=False,
        default="_raw",
        validate=validators.Match("input_field", r"^.*$"),
    )

    metadata_fieldname = Option(
        doc="""
        **Syntax:** **metadata_fieldname=****
        **Description:** The name of the metadata field in the JSON. Default is 'metadata'.
        """,
        require=False,
        default="metadata",
        validate=validators.Match("metadata_fieldname", r"^.*$"),
    )
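
    # Illustrative input event (hypothetical field names and values, for
    # readability only; the actual shape is defined by the producer of
    # these events):
    #
    #   {
    #     "time": 1700000000,
    #     "event_id": "abc123",
    #     "metadata": {"index": "main", "sourcetype": "mydata"},
    #     "src_ip": {"count": 100, "percent_defined": 98.5}
    #   }
    #
    # Every top-level key other than the metadata field, "summary", "time"
    # and "event_id" is expected to be a per-field quality dictionary.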

    def stream(self, records):
        # Start performance counter
        start = time.time()

        # Get request info and set logging level
        reqinfo = trackme_reqinfo(
            self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri
        )
        log.setLevel(reqinfo["logging_level"])

        # check license state
        try:
            check_license = trackme_check_license(
                reqinfo["server_rest_uri"], self._metadata.searchinfo.session_key
            )
            license_is_valid = check_license.get("license_is_valid")
            log.debug(
                f'function check_license called, response="{json.dumps(check_license, indent=2)}"'
            )
        except Exception as e:
            # ensure check_license is defined for the error message below
            check_license = {}
            license_is_valid = 0
            log.error(f'function check_license exception="{str(e)}"')

        # check restricted components
        if license_is_valid != 1:
            log.error(
                f'The requested component is restricted to the Full and Trial edition mode, its execution cannot be accepted, check_license="{json.dumps(check_license, indent=2)}"'
            )
            raise Exception(
                "The requested component is restricted to the Full and Trial edition mode, its execution cannot be accepted, please contact your Splunk administrator."
            )
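
        # From here on, each incoming record is parsed and fanned out into one
        # result per field quality dictionary; malformed JSON is logged and
        # skipped rather than failing the whole search.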

        # Loop in the results
        records_count = 0
        for record in records:
            records_count += 1

            # Get the JSON data from the input field
            json_data = record.get(self.input_field)
            if not json_data:
                log.warning(f"No data found in field '{self.input_field}'")
                continue

            try:
                # Parse the JSON data
                if isinstance(json_data, str):
                    data = json.loads(json_data)
                else:
                    data = json_data

                # Extract metadata
                metadata = data.get(self.metadata_fieldname, {})

                # Process each field in the data
                for field_name, field_data in data.items():
                    # Skip metadata, summary, and time fields
                    if field_name in [
                        self.metadata_fieldname,
                        "summary",
                        "time",
                        "event_id",
                    ]:
                        continue

                    # Skip if field_data is not a dictionary (should be field quality data)
                    if not isinstance(field_data, dict):
                        continue

                    # Create a new record for this field
                    yield_record = {}

                    # Add time information
                    if "time" in data:
                        yield_record["_time"] = data["time"]
                    else:
                        yield_record["_time"] = time.time()

                    # Add all metadata fields with prefix
                    for meta_key, meta_value in metadata.items():
                        # if is a list, only consider the first element
                        if isinstance(meta_value, list):
                            meta_value = meta_value[0]
                        yield_record[f"metadata.{meta_key}"] = meta_value

                    # Add field name
                    yield_record["fieldname"] = field_name

                    # Add all field quality data
                    for field_key, field_value in field_data.items():
                        yield_record[field_key] = field_value

                    # Add event_id if available
                    if "event_id" in data:
                        yield_record["event_id"] = data["event_id"]

                    # create _raw field
                    yield_record["_raw"] = json.dumps(yield_record)
                    yield yield_record

            except json.JSONDecodeError as e:
                log.error(
                    f"Failed to parse JSON from field '{self.input_field}': {str(e)}"
                )
                continue
            except Exception as e:
                log.error(f"Error processing record: {str(e)}")
                continue

        # Log the run time
        log.info(
            f'context="perf", trackmefieldsqualityextract has terminated, records_count="{records_count}", run_time="{round((time.time() - start), 3)}"'
        )


dispatch(TrackMeFieldsQualityExtract, sys.argv, sys.stdin, sys.stdout, __name__)
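
# Usage sketch (assumes the command is exposed as "trackmefieldsqualityextract"
# in commands.conf, which is not shown here):
#
#   | makeresults
#   | eval _raw="{\"time\": 1700000000, \"metadata\": {\"index\": \"main\"}, \"src_ip\": {\"count\": 100}}"
#   | trackmefieldsqualityextract input_field="_raw" metadata_fieldname="metadata"
#
# Each per-field dictionary becomes its own result row carrying the prefixed
# metadata fields, "fieldname", and the field quality metrics.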