You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Splunk_Deploiement/apps/trackme/bin/trackmefieldsquality.py

737 lines
32 KiB

#!/usr/bin/env python
# coding=utf-8
__author__ = "TrackMe Limited"
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
__credits__ = "TrackMe Limited, U.K."
__license__ = "TrackMe Limited, all rights reserved"
__version__ = "0.1.0"
__maintainer__ = "TrackMe Limited, U.K."
__email__ = "support@trackme-solutions.com"
__status__ = "PRODUCTION"
import os
import sys
import time
from datetime import datetime, timezone
import json
import logging
import re
from logging.handlers import RotatingFileHandler
import urllib3
import hashlib
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Resolve the Splunk installation root; required for log and app file paths.
splunkhome = os.environ["SPLUNK_HOME"]

# Configure a dedicated rotating log file for this command (10 MB, 1 backup),
# with UTC timestamps to match Splunk's internal logging convention.
filehandler = RotatingFileHandler(
    f"{splunkhome}/var/log/splunk/trackme_trackmefieldsquality.log",
    mode="a",
    maxBytes=10000000,
    backupCount=1,
)
formatter = logging.Formatter(
    "%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s"
)
logging.Formatter.converter = time.gmtime
filehandler.setFormatter(formatter)

# Grab the root logger once, drop any pre-existing file handlers (Splunk may
# have attached its own), then install ours.
log = logging.getLogger()
for existing_handler in log.handlers[:]:
    if isinstance(existing_handler, logging.FileHandler):
        log.removeHandler(existing_handler)
log.addHandler(filehandler)

# Default to INFO (root default is ERROR); may be raised to DEBUG at runtime.
log.setLevel(logging.INFO)

# Make sibling modules in this bin directory importable.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# import libs
import import_declare_test
# import Splunk libs
from splunklib.searchcommands import (
dispatch,
StreamingCommand,
Configuration,
Option,
validators,
)
# import trackme libs
from trackme_libs import (
trackme_reqinfo,
run_splunk_search,
)
# import trackme licensing libs
from trackme_libs_licensing import trackme_check_license
# import trackme libs utils
from trackme_libs_utils import remove_leading_spaces, strict_interpret_boolean
@Configuration(distributed=False)
class TrackMeFieldsQuality(StreamingCommand):
    """
    trackmefieldsquality: a streaming command that verifies, for every incoming
    record, a set of expected fields (existence, non-empty, not 'unknown',
    optional regex match) and yields a per-record JSON quality summary.

    Exactly one of the mutually exclusive options below may be used to define
    the fields to check: a CSV list, a record fieldname containing a CSV list,
    an inline JSON dict, a JSON file path, a record fieldname containing a JSON
    dict, or a Splunk search command producing a `json_dict` field.
    """

    fields_to_check_list = Option(
        doc="""
        **Syntax:** **fields_to_check_list=****
        **Description:** The list of fields to verified, provided as an argument to the command in a comma separated list.""",
        require=False,
        default=None,
        validate=validators.Match("fields_to_check_list", r"^.*$"),
    )

    fields_to_check_search_command = Option(
        doc="""
        **Syntax:** **fields_to_check_search_command=****
        **Description:** The search command to use to generate the dictionary of fields to check.""",
        require=False,
        default=None,
        validate=validators.Match("fields_to_check_search_command", r"^.*$"),
    )

    fields_to_check_fieldname = Option(
        doc="""
        **Syntax:** **fields_to_check_fieldname=****
        **Description:** Alternatively, the name of the field containing the list of fields to check, provided in a comma separated list.""",
        require=False,
        default=None,
        validate=validators.Match("fields_to_check_fieldname", r"^.*$"),
    )

    fields_to_check_dict = Option(
        doc="""
        **Syntax:** **fields_to_check_dict=****
        **Description:** A JSON string containing a dictionary of fields to check with optional regex patterns and validation settings.
        Example: {"field1": {"name": "field1", "regex": "^[A-Z]+$", "allow_unknown": true, "allow_empty_or_missing": false}, "field2": {"name": "field2"}}""",
        require=False,
        default=None,
        validate=validators.Match("fields_to_check_dict", r"^.*$"),
    )

    fields_to_check_dict_path = Option(
        doc="""
        **Syntax:** **fields_to_check_dict_path=****
        **Description:** Path to a JSON file containing a dictionary of fields to check with optional regex patterns and validation settings.
        Example: $SPLUNK_HOME/etc/apps/trackme/lookups/fields_config.json""",
        require=False,
        default=None,
        validate=validators.Match("fields_to_check_dict_path", r"^.*$"),
    )

    fields_to_check_dict_fieldname = Option(
        doc="""
        **Syntax:** **fields_to_check_dict_fieldname=****
        **Description:** The name of the field containing a JSON string with a dictionary of fields to check with optional regex patterns and validation settings.
        """,
        require=False,
        default=None,
        validate=validators.Match("fields_to_check_dict_fieldname", r"^.*$"),
    )

    include_field_values = Option(
        doc="""
        **Syntax:** **include_field_values=****
        **Description:** Boolean option to include field values in the JSON summary.
        """,
        require=False,
        default=False,
        validate=validators.Boolean(),
    )

    pretty_print_json = Option(
        doc="""
        **Syntax:** **pretty_print_json=****
        **Description:** Boolean option to pretty print the JSON summary. Default is True.
        """,
        require=False,
        default=True,
        validate=validators.Boolean(),
    )

    output_mode = Option(
        doc="""
        **Syntax:** **output_mode=****
        **Description:** The mode to output the results. Default is json, valid options are json and raw.
        """,
        require=False,
        default="json",
        # NOTE: the pattern must be grouped; a bare ^json|raw$ would accept
        # values such as "jsonfoo" or "fooraw" due to alternation precedence.
        validate=validators.Match("output_mode", r"^(json|raw)$"),
    )

    metadata_fields = Option(
        doc="""
        **Syntax:** **metadata_fields=****
        **Description:** A CSV list of metadata fields to include in the metadata section of the JSON when using output_mode=json. index/sourcetype/host/source are always included, you can add others to be included in the metadata section.
        """,
        require=False,
        default="index,sourcetype,host,source",
        validate=validators.Match("metadata_fields", r"^.*$"),
    )

    summary_fieldname = Option(
        doc="""
        **Syntax:** **summary_fieldname=****
        **Description:** Defines the name of the summary field. Default is 'summary'.
        """,
        require=False,
        default="summary",
        validate=validators.Match("summary_fieldname", r"^.*$"),
    )

    metadata_fieldname = Option(
        doc="""
        **Syntax:** **metadata_fieldname=****
        **Description:** Defines the name of the metadata field added to the summary JSON. Default is 'metadata'.
        """,
        require=False,
        default="metadata",
        validate=validators.Match("metadata_fieldname", r"^.*$"),
    )

    @staticmethod
    def _validate_fields_dict(fields_dict):
        """Validate the structure of a fields dictionary.

        Each value must be a dict with a string 'name' property and, when
        present, a string 'regex' property. Raises ValueError otherwise.
        """
        for field_name, field_info in fields_dict.items():
            if not isinstance(field_info, dict):
                raise ValueError(f"Field {field_name} must be a dictionary")
            if "name" not in field_info:
                raise ValueError(f"Field {field_name} must have a 'name' property")
            if not isinstance(field_info["name"], str):
                raise ValueError(f"Field {field_name} name must be a string")
            if "regex" in field_info and not isinstance(field_info["regex"], str):
                raise ValueError(
                    f"Field {field_name} regex must be a string if provided"
                )

    def _resolve_fields_dict(self, record, json_dict):
        """Build the fields dictionary for one record from whichever input
        option was provided (options are mutually exclusive, enforced in
        stream()). Returns an empty dict when no option is set."""
        if self.fields_to_check_list:
            fields_to_check = self.fields_to_check_list.split(",")
            return {
                field.strip(): {"name": field.strip()} for field in fields_to_check
            }

        if self.fields_to_check_fieldname:
            fields_to_check = record.get(self.fields_to_check_fieldname)
            # multivalue fields arrive as lists: keep the first value only
            if isinstance(fields_to_check, list):
                fields_to_check = fields_to_check[0]
            fields_to_check = fields_to_check.split(",")
            return {
                field.strip(): {"name": field.strip()} for field in fields_to_check
            }

        if self.fields_to_check_dict:
            try:
                fields_dict = json.loads(self.fields_to_check_dict)
                self._validate_fields_dict(fields_dict)
            except json.JSONDecodeError:
                raise ValueError(
                    f"Invalid JSON format in fields_to_check_dict: {self.fields_to_check_dict}"
                )
            return fields_dict

        if self.fields_to_check_dict_path:
            try:
                # Relative paths are resolved from SPLUNK_HOME
                if not os.path.isabs(self.fields_to_check_dict_path):
                    file_path = os.path.join(
                        splunkhome, self.fields_to_check_dict_path
                    )
                else:
                    file_path = self.fields_to_check_dict_path
                if not os.path.exists(file_path):
                    raise ValueError(f"JSON file not found: {file_path}")
                with open(file_path, "r") as f:
                    fields_dict = json.load(f)
                self._validate_fields_dict(fields_dict)
            except json.JSONDecodeError:
                raise ValueError(
                    f"Invalid JSON format in file: {self.fields_to_check_dict_path}"
                )
            except IOError as e:
                raise ValueError(f"Error reading JSON file: {str(e)}")
            return fields_dict

        if self.fields_to_check_dict_fieldname:
            json_string = None
            try:
                json_string = record.get(self.fields_to_check_dict_fieldname)
                # multivalue fields arrive as lists: keep the first value only
                if isinstance(json_string, list):
                    json_string = json_string[0]
                fields_dict = json.loads(json_string)
                self._validate_fields_dict(fields_dict)
            except json.JSONDecodeError:
                raise ValueError(
                    f"Invalid JSON format in fields_to_check_dict_fieldname with field_name: {self.fields_to_check_dict_fieldname} and json_string: {json_string}"
                )
            return fields_dict

        if self.fields_to_check_search_command:
            # json_dict is None when the generating search returned no result:
            # fall back to an empty dict instead of crashing on None.values()
            return json_dict if json_dict is not None else {}

        return {}

    def _build_field_summary(
        self,
        status,
        description,
        is_missing,
        is_empty,
        is_unknown,
        regex_pattern,
        regex_failure,
        field_value,
    ):
        """Assemble the per-field summary dict, adding regex details only when
        a pattern was configured and the raw value only when requested."""
        field_summary = {
            "status": status,
            "description": description,
            "is_missing": is_missing,
            "is_empty": is_empty,
            "is_unknown": is_unknown,
        }
        if regex_pattern:
            field_summary["regex_failure"] = regex_failure
            field_summary["regex_expression"] = regex_pattern
        if self.include_field_values:
            field_summary["value"] = field_value
        return field_summary

    def _evaluate_field(self, record, field_info):
        """Check a single field of a record.

        Returns a (output_field_name, field_summary, passed) tuple where
        passed is True when the field satisfied the configured checks.
        """
        field = field_info["name"]

        # Reserved output keys are renamed orig_<field_name> so they cannot
        # collide with the internal keys added to the summary JSON.
        output_field_name = field
        if field in ("metadata", "event_id", "summary"):
            output_field_name = f"orig_{field}"
            logging.info(
                f'context="reserved_field", field="{field}" renamed to "{output_field_name}"'
            )

        regex_pattern = field_info.get("regex")
        allow_unknown = strict_interpret_boolean(
            field_info.get("allow_unknown", False)
        )
        allow_empty_or_missing = strict_interpret_boolean(
            field_info.get("allow_empty_or_missing", False)
        )
        field_value = record.get(field)

        is_missing = field_value is None
        is_empty = False
        is_unknown = False
        regex_failure = False

        # Missing field: pass or fail depending on allow_empty_or_missing
        if is_missing:
            if allow_empty_or_missing:
                return (
                    output_field_name,
                    self._build_field_summary(
                        "success",
                        "Field does not exist but is allowed to be missing.",
                        is_missing,
                        is_empty,
                        is_unknown,
                        regex_pattern,
                        regex_failure,
                        field_value,
                    ),
                    True,
                )
            return (
                output_field_name,
                self._build_field_summary(
                    "failure",
                    "Field does not exist.",
                    is_missing,
                    is_empty,
                    is_unknown,
                    regex_pattern,
                    regex_failure,
                    field_value,
                ),
                False,
            )

        if isinstance(field_value, list):
            # Multivalue field: every item must pass; 'unknown' items are
            # tolerated (and skip the regex check) when allow_unknown is set.
            reason = None
            for item in field_value:
                if isinstance(item, str) and item.lower() == "unknown":
                    if not allow_unknown:
                        reason = "contains 'unknown'"
                        is_unknown = True
                        break
                elif regex_pattern and not re.match(regex_pattern, str(item)):
                    reason = "one or more values in the list do not match the required pattern"
                    regex_failure = True
                    break
            if reason is not None:
                return (
                    output_field_name,
                    self._build_field_summary(
                        "failure",
                        f"Field exists but {reason}.",
                        is_missing,
                        is_empty,
                        is_unknown,
                        regex_pattern,
                        regex_failure,
                        field_value,
                    ),
                    False,
                )
        else:
            # Scalar value checks: empty, 'unknown', then regex (elif chain,
            # so an allowed 'unknown' value intentionally bypasses the regex).
            if field_value == "":
                if allow_empty_or_missing:
                    return (
                        output_field_name,
                        self._build_field_summary(
                            "success",
                            "Field is empty but is allowed to be empty.",
                            is_missing,
                            True,
                            is_unknown,
                            regex_pattern,
                            regex_failure,
                            field_value,
                        ),
                        True,
                    )
                is_empty = True
                return (
                    output_field_name,
                    self._build_field_summary(
                        "failure",
                        "Field is empty.",
                        is_missing,
                        is_empty,
                        is_unknown,
                        regex_pattern,
                        regex_failure,
                        field_value,
                    ),
                    False,
                )
            elif isinstance(field_value, str) and field_value.lower() == "unknown":
                if not allow_unknown:
                    is_unknown = True
                    return (
                        output_field_name,
                        self._build_field_summary(
                            "failure",
                            "Field is 'unknown'.",
                            is_missing,
                            is_empty,
                            is_unknown,
                            regex_pattern,
                            regex_failure,
                            field_value,
                        ),
                        False,
                    )
            elif regex_pattern and not re.match(regex_pattern, str(field_value)):
                regex_failure = True
                return (
                    output_field_name,
                    self._build_field_summary(
                        "failure",
                        "Field exists but value does not match the required pattern.",
                        is_missing,
                        is_empty,
                        is_unknown,
                        regex_pattern,
                        regex_failure,
                        field_value,
                    ),
                    False,
                )

        # Field exists, is not empty, not a disallowed 'unknown', and matched
        # the regex when one was configured.
        return (
            output_field_name,
            self._build_field_summary(
                "success",
                "Field exists and is valid.",
                is_missing,
                is_empty,
                is_unknown,
                regex_pattern,
                regex_failure,
                field_value,
            ),
            True,
        )

    def stream(self, records):
        """Process each record: verify license, resolve the fields to check,
        evaluate each field, then yield a JSON (or raw) quality summary."""
        # Start performance counter
        start = time.time()

        # Get request info and set logging level
        reqinfo = trackme_reqinfo(
            self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri
        )
        log.setLevel(reqinfo["logging_level"])

        # check license state; check_license is pre-initialized so the
        # restricted-component error log below cannot hit a NameError when
        # trackme_check_license itself raised
        check_license = {}
        try:
            check_license = trackme_check_license(
                reqinfo["server_rest_uri"], self._metadata.searchinfo.session_key
            )
            license_is_valid = check_license.get("license_is_valid")
            logging.debug(
                f'function check_license called, response="{json.dumps(check_license, indent=2)}"'
            )
        except Exception as e:
            license_is_valid = 0
            logging.error(f'function check_license exception="{str(e)}"')

        # this component requires a valid Full or Trial license
        if license_is_valid != 1:
            logging.error(
                f'The requested component is restricted to the Full and Trial edition mode, its execution cannot be accepted, check_license="{json.dumps(check_license, indent=2)}"'
            )
            raise Exception(
                f"The requested component is restricted to the Full and Trial edition mode, its execution cannot be accepted, please contact your Splunk administrator."
            )

        # the fields-definition options are mutually exclusive
        provided_options = [
            self.fields_to_check_list,
            self.fields_to_check_fieldname,
            self.fields_to_check_dict,
            self.fields_to_check_dict_path,
            self.fields_to_check_dict_fieldname,
            self.fields_to_check_search_command,
        ]
        if sum(1 for option in provided_options if option) > 1:
            raise ValueError(
                "Only one of fields_to_check_list, fields_to_check_fieldname, fields_to_check_dict, fields_to_check_dict_path, fields_to_check_dict_fieldname or fields_to_check_search_command can be provided"
            )

        # if fields_to_check_search_command is provided, run the search and
        # load its json_dict field once; it is shared by every record
        json_dict = None
        if self.fields_to_check_search_command:
            try:
                reader = run_splunk_search(
                    self.service,
                    remove_leading_spaces(self.fields_to_check_search_command),
                    {
                        "earliest_time": "-5m",
                        "latest_time": "now",
                        "preview": "false",
                        "output_mode": "json",
                        "count": 0,
                    },
                    24,
                    5,
                )
                for item in reader:
                    if isinstance(item, dict):
                        if "json_dict" not in item:
                            raise ValueError(
                                f"json_dict field not found in the search results for search command: {self.fields_to_check_search_command}"
                            )
                        json_dict = json.loads(item["json_dict"])
                        # only one result is expected
                        break
            except Exception as e:
                error_msg = f'context="error", trackmefieldsquality has failed with exception="{str(e)}"'
                logging.error(error_msg)
                raise Exception(error_msg)

        # Loop in the results
        records_count = 0
        for record in records:
            records_count += 1
            json_summary = {"time": float(record.get("_time", time.time()))}

            fields_dict = self._resolve_fields_dict(record, json_dict)

            # Evaluate every configured field and maintain the counters
            total_fields_checked = 0
            total_fields_failed = 0
            total_fields_passed = 0
            list_fields_passed = []
            list_fields_failed = []

            for field_info in fields_dict.values():
                output_field_name, field_summary, passed = self._evaluate_field(
                    record, field_info
                )
                json_summary[output_field_name] = field_summary
                total_fields_checked += 1
                if passed:
                    total_fields_passed += 1
                    list_fields_passed.append(output_field_name)
                else:
                    total_fields_failed += 1
                    list_fields_failed.append(output_field_name)

            # Determine overall status; guard the percentages against an
            # empty fields dictionary (division by zero)
            overall_status = "success" if total_fields_failed == 0 else "failure"
            if total_fields_checked:
                percentage_failed = round(
                    total_fields_failed / total_fields_checked * 100, 2
                )
                percentage_passed = round(
                    total_fields_passed / total_fields_checked * 100, 2
                )
            else:
                percentage_failed = 0.0
                percentage_passed = 0.0

            json_summary[self.summary_fieldname] = {
                "overall_status": overall_status,
                "total_fields_checked": total_fields_checked,
                "total_fields_failed": total_fields_failed,
                "total_fields_passed": total_fields_passed,
                "percentage_failed": percentage_failed,
                "percentage_passed": percentage_passed,
                "list_fields_passed": list_fields_passed,
                "list_fields_failed": list_fields_failed,
            }

            # serialized summary, pretty printed on request (raw mode only;
            # json mode carries the dict itself)
            indent_value = 4 if self.pretty_print_json else None
            yield_record = {
                "json_summary": json.dumps(json_summary, indent=indent_value)
            }

            if self.output_mode == "raw":
                #
                # output_mode=raw: echo the original record fields and add a
                # deterministic event_id (sha256 of the summary)
                #
                for k, v in record.items():
                    yield_record[k] = v
                yield_record["event_id"] = hashlib.sha256(
                    json.dumps(json_summary).encode("utf-8")
                ).hexdigest()
                yield yield_record

            elif self.output_mode == "json":
                #
                # output_mode=json: build the metadata section, then emit the
                # summary dict as _raw
                #
                metadata_json = {}
                event_time = float(record.get("_time", time.time()))
                # epoch plus a human readable (%c %Z, UTC) representation
                metadata_json["time_epoch"] = event_time
                metadata_json["time_human"] = datetime.fromtimestamp(
                    event_time,
                    tz=timezone.utc,
                ).strftime("%c %Z")

                # index, sourcetype, host and source are always included
                metadata_json["index"] = record.get("index")
                metadata_json["sourcetype"] = record.get("sourcetype")
                metadata_json["host"] = record.get("host")
                metadata_json["source"] = record.get("source")

                # additional metadata fields requested by the user
                if self.metadata_fields:
                    for field in self.metadata_fields.split(","):
                        field = field.strip()
                        if field in record and field != "json_summary":
                            metadata_json[field] = record[field]

                # the event_id is computed over the summary including the
                # metadata section, before the event_id key itself is added
                json_summary[self.metadata_fieldname] = metadata_json
                json_summary["event_id"] = hashlib.sha256(
                    json.dumps(json_summary).encode("utf-8")
                ).hexdigest()

                yield_record = {
                    "_time": event_time,
                    "_raw": json_summary,
                    "json_summary": json_summary,
                    "index": record.get("index"),
                    "sourcetype": record.get("sourcetype"),
                    "host": record.get("host"),
                    "source": record.get("source"),
                }
                yield yield_record

        # Log the run time
        logging.info(
            f'context="perf", trackmefieldsquality has terminated, records_count="{records_count}", run_time="{round((time.time() - start), 3)}"'
        )
dispatch(TrackMeFieldsQuality, sys.argv, sys.stdin, sys.stdout, __name__)