#!/usr/bin/env python # coding=utf-8 __author__ = "TrackMe Limited" __copyright__ = "Copyright 2022-2026, TrackMe Limited, U.K." __credits__ = "TrackMe Limited, U.K." __license__ = "TrackMe Limited, all rights reserved" __version__ = "0.1.0" __maintainer__ = "TrackMe Limited, U.K." __email__ = "support@trackme-solutions.com" __status__ = "PRODUCTION" import os import sys import time import json import logging from logging.handlers import RotatingFileHandler import urllib3 import hashlib urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) splunkhome = os.environ["SPLUNK_HOME"] # set logging filehandler = RotatingFileHandler( "%s/var/log/splunk/trackme_trackmepushdatasource.log" % splunkhome, mode="a", maxBytes=10000000, backupCount=1, ) formatter = logging.Formatter( "%(asctime)s %(levelname)s %(filename)s %(funcName)s %(lineno)d %(message)s" ) logging.Formatter.converter = time.gmtime filehandler.setFormatter(formatter) log = logging.getLogger() # root logger - Good to get it only once. 
for hdlr in log.handlers[:]:  # remove the existing file handlers
    if isinstance(hdlr, logging.FileHandler):
        log.removeHandler(hdlr)
log.addHandler(filehandler)  # set the new handler
# set the log level to INFO, DEBUG as the default is ERROR
log.setLevel(logging.INFO)

# append current directory
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# import libs
import import_declare_test

# import Splunk libs
from splunklib.searchcommands import (
    dispatch,
    StreamingCommand,
    Configuration,
    Option,
    validators,
)

# import trackme libs
from trackme_libs import (
    trackme_reqinfo,
    run_splunk_search,
)

# import trackme libs utils
from trackme_libs_utils import remove_leading_spaces, decode_unicode

# import trackme libs get data
from trackme_libs_get_data import get_full_kv_collection_by_object


@Configuration(distributed=False)
class TrackMePushDataSource(StreamingCommand):
    """Streaming command that pushes incoming records as new TrackMe entities.

    For each incoming record, the command checks the tenant's dsm/dhm KV store
    collection; records not yet tracked are registered by generating and
    running an abstract tracker search. Yields a single summary record (plus
    error records on failure).
    """

    tenant_id = Option(
        doc="""
        **Syntax:** **tenant_id=****
        **Description:** The tenant identifier.""",
        require=True,
        default=None,
    )

    component = Option(
        doc="""
        **Syntax:** **component=****
        **Description:** The component to use (dsm or dhm).""",
        require=True,
        validate=validators.Match("component", r"^(dsm|dhm)$"),
    )

    search_type = Option(
        doc="""
        **Syntax:** **search_type=****
        **Description:** The type of search to perform (tstats or raw).""",
        require=True,
        validate=validators.Match("search_type", r"^(tstats|raw)$"),
    )

    show_search_query = Option(
        doc="""
        **Syntax:** **show_search_query=****
        **Description:** If true, includes the search query in the summary output.""",
        require=False,
        default=False,
        validate=validators.Boolean(),
    )

    show_search_results = Option(
        doc="""
        **Syntax:** **show_search_results=****
        **Description:** If true, includes the search results in the summary output.""",
        require=False,
        default=False,
        validate=validators.Boolean(),
    )

    pretend_latest = Option(
        doc="""
        **Syntax:** **pretend_latest=****
        **Description:** Relative time value in Splunk format for data_last_time_seen.
        Default is -24h.""",
        require=False,
        default="-24h",
        validate=validators.Match("pretend_latest", r"^.*$"),
    )

    def stream(self, records):
        """Process incoming records and register unknown entities.

        Yields a summary dict (counters, rejected records, processing time,
        optionally the generated search and its results) or an error dict.
        """

        # Start performance counter
        start = time.time()

        # Get request info and set logging level
        reqinfo = trackme_reqinfo(
            self._metadata.searchinfo.session_key, self._metadata.searchinfo.splunkd_uri
        )
        log.setLevel(reqinfo["logging_level"])

        # instance id for logging (random sha256 hash)
        instance_id = hashlib.sha256(
            f"{time.time()}{self.tenant_id}{self.component}{self.search_type}{self.show_search_query}{self.show_search_results}{self.pretend_latest}".encode()
        ).hexdigest()

        # log start (use the configured logger consistently, not the root
        # logging module functions)
        log.info(
            f"tenant_id={self.tenant_id}, component={self.component}, instance_id={instance_id}, trackmepushdatasource is starting processing, search_type={self.search_type}, show_search_query={self.show_search_query}, show_search_results={self.show_search_results}, pretend_latest={self.pretend_latest}"
        )

        # Initialize counters and storage
        records_count = 0
        missing_records = []
        existing_records = 0
        objects_added = []
        rejected_records = []

        # Get the KV store collection for the tenant
        kv_collection_name = f"kv_trackme_{self.component}_tenant_{self.tenant_id}"
        kv_collection = self.service.kvstore[kv_collection_name]

        # get the full collection (records list, set of object keys, dict by key)
        try:
            kv_collection_records, kv_collection_objects, kv_collection_dict = (
                get_full_kv_collection_by_object(kv_collection, kv_collection_name)
            )
        except Exception as e:
            log.error(
                f"tenant_id={self.tenant_id}, component={self.component}, instance_id={instance_id}, error getting full collection: {str(e)}"
            )
            yield {"error": str(e)}
            return

        # Loop through records
        for record in records:
            records_count += 1

            # Extract required fields; decode_unicode may raise on undecodable
            # values, in which case the field is treated as absent
            try:
                object_name = decode_unicode(record.get("object"))
            except Exception:
                object_name = None
            try:
                index = decode_unicode(record.get("index"))
            except Exception:
                index = None
            try:
                sourcetype = decode_unicode(record.get("sourcetype"))
            except Exception:
                sourcetype = None
            try:
                host = decode_unicode(record.get("host"))  # only for dhm
            except Exception:
                host = None

            # lower all of them
            if object_name:
                object_name = object_name.lower()
            if index:
                index = index.lower()
            if sourcetype:
                sourcetype = sourcetype.lower()

            # orig_host, for dhm only
            orig_host = None

            # Validate index and sourcetype based on component type
            if self.component == "dsm":
                if not index:
                    log.error(
                        f"Missing required index for DSM component in record: {record}"
                    )
                    rejected_records.append(
                        {
                            "record": record,
                            "reason": "Missing required index for DSM component",
                        }
                    )
                    continue
                if not sourcetype:
                    log.error(
                        f"Missing required sourcetype for DSM component in record: {record}"
                    )
                    rejected_records.append(
                        {
                            "record": record,
                            "reason": "Missing required sourcetype for DSM component",
                        }
                    )
                    continue
                # unless object is specified, for dsm we will compose it as index:sourcetype
                if not object_name:
                    object_name = f"{index}:{sourcetype}"

            elif self.component == "dhm":
                if not host:
                    log.error(
                        f"Missing required host field for DHM component in record: {record}"
                    )
                    rejected_records.append(
                        {
                            "record": record,
                            "reason": "Missing required host field for DHM component",
                        }
                    )
                    continue
                else:
                    # for dhm, the object is the host with the prefix key:host| - if the
                    # prefix is not present, add it.
                    # Fix: lowercase the host before building the key so that the
                    # membership check below and the value stored in the KV record
                    # use the exact same normalized key (previously a mixed-case
                    # host could never match its stored lowercase entry and was
                    # re-added on every run).
                    # NOTE(review): assumes dhm keys in the collection are
                    # lowercase, as this command itself stores them - confirm
                    # against the tracker that populates the collection.
                    orig_host = record.get("host")
                    host = host.lower()
                    if not host.startswith("key:host|"):
                        host = f"key:host|{host}"
                    object_name = host

            # For dhm, index and sourcetype are optional and may be multivalue:
            # flatten lists into CSV strings and write them back into the record.
            # (A previously duplicated dsm sourcetype check was removed here: it
            # was unreachable because the validation above already rejects dsm
            # records without a sourcetype.)
            if self.component == "dhm":
                if index:
                    # if index is a list, convert it to CSV string
                    if isinstance(index, list):
                        index = ",".join(index)
                    # If index is not a list, keep it as is
                    record["index"] = index
                if sourcetype:
                    # If sourcetype is a list, convert it to CSV string
                    if isinstance(sourcetype, list):
                        sourcetype = ",".join(sourcetype)
                    # If sourcetype is not a list, keep it as is
                    record["sourcetype"] = sourcetype

            # Check if object exists in KV store
            if object_name not in kv_collection_objects:
                # Object doesn't exist, add to missing records
                record_to_add = {}
                if self.component == "dsm":
                    record_to_add["object"] = object_name
                    record_to_add["index"] = index
                    record_to_add["sourcetype"] = sourcetype
                if self.component == "dhm":
                    # reuse the normalized host computed above so the stored key
                    # matches the membership key exactly
                    record_to_add["host"] = host
                    record_to_add["alias"] = orig_host  # alias should be defined for dhm
                    # index and sourcetype are optional for dhm
                    if index:
                        record_to_add["index"] = index
                    if sourcetype:
                        record_to_add["sourcetype"] = sourcetype
                missing_records.append(record_to_add)
                log.info(
                    f"tenant_id={self.tenant_id}, component={self.component}, instance_id={instance_id}, collection={kv_collection_name}, Adding record to missing records: {json.dumps(record_to_add)}"
                )
            else:
                existing_records += 1

        # Process missing records if any
        if missing_records:
            try:
                # Create the search string: one JSON-ish fragment per entity,
                # joined with '#' and re-split/spath'd at search time
                data_strings = []
                for record in missing_records:
                    if self.component == "dsm":
                        data_strings.append(
                            f"\"object\": \"{record['object']}\", \"data_index\": \"{record['index']}\", \"data_sourcetype\": \"{record['sourcetype']}\""
                        )
                    elif self.component == "dhm":
                        host = record.get("host")
                        alias = record.get("alias")
                        index = record.get("index", "")
                        sourcetype = record.get("sourcetype", "")
                        data_strings.append(
                            f'"index": "{index}", "sourcetype": "{sourcetype}", '
                            f'"host": "{host}", '
                            f'"alias": "{alias}", '
                            f'"data_eventcount": 0, '
                            f'"avg_eventcount_5m": 0, "latest_eventcount_5m": 0, "perc95_eventcount_5m": 0, '
                            f'"avg_latency_5m": 0, "latest_latency_5m": 0, "perc95_latency_5m": 0, '
                            f'"stdev_latency_5m": 0, "stdev_eventcount_5m": 0, '
                            f'"data_first_time_seen": 0, '
                            f'"data_last_ingestion_lag_seen": 0'
                        )

                # escape double quotes in data_strings
                data_strings = [
                    data_string.replace('"', '\\"') for data_string in data_strings
                ]

                search_query = remove_leading_spaces(
                    f"""
                    | makeresults
                    | eval data = "{'#'.join(data_strings)}"
                    | eval data=split(data, "#")
                    | mvexpand data
                    | eval data = "{{" . data . "}}"
                    | fields - _time
                    | spath input=data
                    | fields - data
                    | eval data_last_time_seen=relative_time(now(), "{self.pretend_latest}"), data_last_ingest=relative_time(now(), "{self.pretend_latest}")
                    {"| " if self.component == "dhm" else ""}`trackme_{self.component}_tracker_abstract({self.tenant_id}, {self.search_type})`
                    | `trackme_outputlookup(trackme_{self.component}_tenant_{self.tenant_id}, key)`
                    """
                ).strip()

                # Execute the search using run_splunk_search
                kwargs = {
                    "earliest_time": "-5m",
                    "latest_time": "now",
                    "count": 0,
                    "output_mode": "json",
                }

                search_results = []
                try:
                    reader = run_splunk_search(
                        self.service,
                        search_query,
                        kwargs,
                        24,  # max_retries
                        5,  # retry_delay
                    )
                    for item in reader:
                        if isinstance(item, dict):
                            log.debug(f'search_results="{item}"')
                            search_results.append(item)
                            objects_added.append(item.get("object"))

                    # Add summary to the output
                    yield_summary = {
                        "total_records_processed": records_count,
                        "existing_records": existing_records,
                        "new_records_added": len(missing_records),
                        "objects_added": objects_added,
                        "rejected_records": rejected_records,
                        "processing_time": round(time.time() - start, 2),
                    }

                    # Add search query to summary if requested
                    if self.show_search_query:
                        yield_summary["search_query"] = search_query

                    # Add search results to summary if requested
                    if self.show_search_results:
                        yield_summary["search_results"] = search_results

                    yield yield_summary

                except Exception as e:
                    log.error(f"Error executing search: {str(e)}")
                    yield {"error": str(e), "search": search_query}

            except Exception as e:
                log.error(f"Error preparing search: {str(e)}")
                yield {"error": str(e)}

        else:
            # No missing records, just return summary
            yield_summary = {
                "total_records_processed": records_count,
                "existing_records": existing_records,
                "new_records_added": 0,
                "objects_added": [],
                "rejected_records": rejected_records,
                "processing_time": round(time.time() - start, 2),
            }
            yield yield_summary

        # log end
        log.info(
            f"tenant_id={self.tenant_id}, component={self.component}, instance_id={instance_id}, trackmepushdatasource processing completed, records_count={records_count}, existing_records={existing_records}, new_records_added={len(missing_records)}, objects_added={objects_added}, rejected_records={rejected_records}, processing_time={round(time.time() - start, 2)}"
        )


dispatch(TrackMePushDataSource, sys.argv, sys.stdin, sys.stdout, __name__)