#!/usr/bin/env python
# coding=utf-8

__author__ = "TrackMe Limited"
__copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K."
__credits__ = "TrackMe Limited, U.K."
__license__ = "TrackMe Limited, all rights reserved"
__version__ = "0.1.0"
__maintainer__ = "TrackMe Limited, U.K."
__email__ = "support@trackme-solutions.com"
__status__ = "PRODUCTION"

import os
import sys
import time
import json
import logging
from logging.handlers import RotatingFileHandler
import urllib3
from collections import Counter

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

splunkhome = os.environ["SPLUNK_HOME"]

# set logging: rotating file handler with UTC timestamps
filehandler = RotatingFileHandler(
    "%s/var/log/splunk/trackme_trackmefieldsqualitygensummary.log" % splunkhome,
    mode="a",
    maxBytes=10000000,
    backupCount=1,
)
formatter = logging.Formatter(
    "%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s"
)
logging.Formatter.converter = time.gmtime
filehandler.setFormatter(formatter)
log = logging.getLogger()  # root logger - get it only once
for hdlr in log.handlers[:]:  # remove the existing file handlers
    if isinstance(hdlr, logging.FileHandler):
        log.removeHandler(hdlr)
log.addHandler(filehandler)  # set the new handler
# set the log level to INFO, DEBUG as the default is ERROR
log.setLevel(logging.INFO)

# append current directory
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# import libs
import import_declare_test

# import Splunk libs
from splunklib.searchcommands import (
    dispatch,
    StreamingCommand,
    Configuration,
    Option,
    validators,
)

# import trackme libs
from trackme_libs import (
    trackme_reqinfo,
)

# import trackme licensing libs
from trackme_libs_licensing import trackme_check_license


# Helper to ensure hashable group keys
def make_hashable(value):
    if isinstance(value, list):
        # represent a multivalue by its first element (None when empty)
        return make_hashable(value[0]) if value else None
    elif isinstance(value, dict):
        return tuple(sorted((k, make_hashable(v)) for k, v in value.items()))
    else:
        return value

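
# Illustrative only, not used by the command itself: how make_hashable
# normalizes typical record values into hashable group-key parts
# (example inputs and outputs are assumptions, not TrackMe data):
#   make_hashable(["prod", "dev"])           -> "prod" (first element of a multivalue)
#   make_hashable({"env": "prod", "dc": 1})  -> (("dc", 1), ("env", "prod"))
#   make_hashable("linux")                   -> "linux" (scalars pass through)
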
@Configuration(distributed=False)
class TrackMeFieldsQualityGenSummary(StreamingCommand):

    maxvals = Option(
        doc="""
        **Syntax:** **maxvals=<integer>**
        **Description:** Max number of distinct values to report in field_values.
        """,
        require=False,
        default=15,
        validate=validators.Match("maxvals", r"^\d+$"),
    )

    fieldvalues_format = Option(
        doc="""
        **Syntax:** **fieldvalues_format=<list|csv>**
        **Description:** Format of field_values, either list or csv.
        """,
        require=False,
        default="csv",
        validate=validators.Match("fieldvalues_format", r"^(list|csv)$"),
    )

    groupby_metadata_fields = Option(
        doc="""
        **Syntax:** **groupby_metadata_fields=field1,field2,...**
        **Description:** Comma-separated list of metadata fields to group by in addition to fieldname.
        """,
        require=False,
        default="",
        validate=validators.Match("groupby_metadata_fields", r"^.*$"),
    )
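
    # Illustrative SPL invocation; the exposed command name is assumed from the
    # log naming and the authoritative stanza is defined in commands.conf:
    #   | trackmefieldsqualitygensummary maxvals=10 fieldvalues_format=csv
    #       groupby_metadata_fields="index,sourcetype"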

    def stream(self, records):

        # Start performance counter
        start = time.time()

        # Get request info and set logging level
        reqinfo = trackme_reqinfo(
            self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri
        )
        log.setLevel(reqinfo["logging_level"])

        # check license state
        try:
            check_license = trackme_check_license(
                reqinfo["server_rest_uri"], self._metadata.searchinfo.session_key
            )
            license_is_valid = check_license.get("license_is_valid")
            log.debug(
                f'function check_license called, response="{json.dumps(check_license, indent=2)}"'
            )

        except Exception as e:
            # keep check_license defined so the restricted-component log below cannot fail
            check_license = {}
            license_is_valid = 0
            log.error(f'function check_license exception="{str(e)}"')

        # check restricted components
        if license_is_valid != 1:
            log.error(
                f'The requested component is restricted to the Full and Trial edition mode, its execution cannot be accepted, check_license="{json.dumps(check_license, indent=2)}"'
            )
            raise Exception(
                "The requested component is restricted to the Full and Trial edition mode, its execution cannot be accepted, please contact your Splunk administrator."
            )

        # Parse groupby_metadata_fields argument
        groupby_metadata_fields = []
        if hasattr(self, "groupby_metadata_fields") and self.groupby_metadata_fields:
            groupby_metadata_fields = [
                f.strip() for f in self.groupby_metadata_fields.split(",") if f.strip()
            ]
            # if fieldname is in the list, remove it
            if "fieldname" in groupby_metadata_fields:
                groupby_metadata_fields.remove("fieldname")

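        # For example (assumed input, for illustration only):
        #   groupby_metadata_fields="index, sourcetype, fieldname"
        # parses to ["index", "sourcetype"]: entries are trimmed, empty entries
        # are dropped, and "fieldname" is removed as it always leads the group key.
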
        # Aggregated statistics keyed by (fieldname, metadata values...);
        # each group is initialized explicitly on first sight in the loop below
        field_data = {}

        # Loop through all records to collect data
        records_count = 0
        for record in records:
            records_count += 1

            # Extract required fields
            fieldname = record.get("fieldname")
            value = record.get("value")
            event_time = record.get("_time")
            regex_expression = record.get("regex_expression")

            if not fieldname:
                log.warning(f"Record {records_count} missing fieldname")
                continue

            # Build the group key: (fieldname, metadata values...)
            meta_key_values = tuple(
                make_hashable(record.get(f, None)) for f in groupby_metadata_fields
            )
            group_key = (fieldname,) + meta_key_values

            # Initialize field data if not exists
            if group_key not in field_data:
                field_data[group_key] = {
                    "values": Counter(),
                    "total_count": 0,
                    "non_empty_count": 0,
                    "metadata": {},
                    "meta_key_values": {
                        f: record.get(f, None) for f in groupby_metadata_fields
                    },
                    "regex_expression": regex_expression,
                    # remember the _time of the first record seen for this group
                    "event_time": event_time,
                }

            # Count total events
            field_data[group_key]["total_count"] += 1

            # Check if value is not null and not empty
            is_non_empty = value is not None and str(value).strip() != ""
            if is_non_empty:
                field_data[group_key]["non_empty_count"] += 1
                # Count the value (convert to string for consistency)
                field_data[group_key]["values"][str(value)] += 1

            # Store metadata from first record (assuming consistent metadata per field)
            if not field_data[group_key]["metadata"]:
                for key, val in record.items():
                    if key not in ["fieldname", "value", "_time", "_raw"]:
                        field_data[group_key]["metadata"][key] = val

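        # At this stage field_data maps group keys to aggregates, e.g.
        # (illustrative values only, not real TrackMe output):
        #   ("status", "firewall") -> {"values": Counter({"200": 35, "500": 6, "403": 1}),
        #                              "total_count": 50, "non_empty_count": 42, ...}
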
        # Generate summary records for each field+metadata combination
        for group_key, data in field_data.items():
            try:
                # Calculate statistics
                total_events = data["total_count"]
                non_empty_count = data["non_empty_count"]
                distinct_value_count = len(data["values"])

                # Calculate percent coverage
                percent_coverage = (
                    (non_empty_count / total_events * 100) if total_events > 0 else 0
                )

                # Generate field_values string
                field_values_parts = []
                if data["values"]:
                    # Sort values by count (descending) and take top maxvals
                    sorted_values = data["values"].most_common(int(self.maxvals))

                    for value, count in sorted_values:
                        value_percentage = (
                            (count / non_empty_count * 100)
                            if non_empty_count > 0
                            else 0
                        )
                        field_values_parts.append(f"{value_percentage:.2f}% {value}")

                field_values = field_values_parts
                if self.fieldvalues_format == "csv":
                    field_values = ",".join(field_values_parts)

                # Create summary record
                yield_record = {
                    "fieldname": group_key[0],
                    "total_events": total_events,
                    "distinct_value_count": distinct_value_count,
                    "percent_coverage": round(percent_coverage, 2),
                    "field_values": field_values,
                    "summary": {
                        "fieldname": group_key[0],
                        "total_events": total_events,
                        "distinct_value_count": distinct_value_count,
                        "percent_coverage": round(percent_coverage, 2),
                        "field_values": field_values,
                    },
                }

                # Add regex_expression if it exists in the data
                if data.get("regex_expression") is not None:
                    yield_record["regex_expression"] = data["regex_expression"]

                # Add selected metadata fields to output (only if they exist in the group)
                for idx, meta_field in enumerate(groupby_metadata_fields):
                    meta_value = group_key[idx + 1]
                    if meta_value is not None:
                        yield_record[meta_field] = meta_value

                # Use the time of the first record seen for this group, fall back to now
                group_time = data.get("event_time")
                yield_record["_time"] = group_time if group_time else time.time()

                # add _raw
                yield_record["_raw"] = json.dumps(yield_record)

                yield yield_record

            except Exception as e:
                log.error(f"Error processing field '{group_key}': {str(e)}")
                continue

        # Log the run time
        log.info(
            f'context="perf", trackmefieldsqualitygensummary has terminated, records_count="{records_count}", fields_processed="{len(field_data)}", run_time="{round((time.time() - start), 3)}"'
        )
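
# Illustrative output shape (assumed example values): with maxvals=3 and
# fieldvalues_format="csv", a group of 50 events where "status" is set 42 times
# would emit something like:
#   fieldname="status", total_events=50, distinct_value_count=3,
#   percent_coverage=84.0, field_values="83.33% 200,14.29% 500,2.38% 403"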


dispatch(TrackMeFieldsQualityGenSummary, sys.argv, sys.stdin, sys.stdout, __name__)