You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
SH-Deployer/apps/SA-ITOA/lib/ITOA/splunk_search_chunk_protoco...

588 lines
20 KiB

# Copyright (C) 2005-2025 Splunk Inc. All Rights Reserved.
'''
This module handles getting results in and out of the splunk search protocol
For most things, you'll want read_chunk and write_chunk
'''
import sys
import re
import json
import abc
import time
import os
import csv
import splunk.rest as rest
from queue import Queue, Empty
from threading import Thread
from .setup_logging import ItsiLogger, SEARCH_LOG_FILE, getLogger
from ITOA.itoa_common import get_csv_dict_writer
"""
If you are reading or writing binary data, such as an image, under Windows,
the file must be opened in binary mode.
But unix does not make a distinction between text and binary modes
"""
if sys.platform == "win32":
    # Windows translates '\n' <-> '\r\n' on text-mode streams, which would
    # corrupt the byte counts declared in the chunked transport header.
    # Force the standard streams into binary mode; Unix needs no such switch.
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
    msvcrt.setmode(sys.stderr.fileno(), os.O_BINARY)
    msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY)
# Stanza in itsi_settings.conf that holds custom search command settings
CONF_CUSTOMSEARCH_STANZA = 'customsearch'
# Setting name: how long (seconds) to wait when reading a chunk from splunkd
CONF_CUSTOMSEARCH_SETTING_TIMEOUT_READ = 'timeout_read'
# Fallback read timeout (seconds) when the conf setting is absent or invalid
CONF_CUSTOMSEARCH_SETTING_TIMEOUT_READ_DEFAULT = 60
def get_itsi_conf_setting(session_key, stanza, setting, logger):
    """
    Reads and returns the given itsi_settings.conf setting via REST.

    @param session_key: the session key
    @type session_key: str
    @param stanza: the conf stanza
    @type stanza: str
    @param setting: the conf setting
    @type setting: str
    @param logger: logger used to record REST/parsing failures
    @type logger: logging.Logger
    @returns: the conf setting value, or None when the stanza/setting is
        missing or the REST call fails
    @rtype: any
    """
    try:
        response, content = rest.simpleRequest(
            '/servicesNS/nobody/SA-ITOA/configs/conf-itsi_settings/' + stanza,
            sessionKey=session_key,
            getargs={'output_mode': 'json'}
        )
        if response.status == 200:
            # 'entry' may be absent from the payload; treat that as "no match"
            # instead of letting the for-loop raise on None.
            entries = json.loads(content).get('entry') or []
            for entry in entries:
                if entry.get('name') != stanza:
                    continue
                return entry.get('content', {}).get(setting)
    except Exception as e:
        logger.exception(e)
    return None
class SearchChunkProtocol(metaclass=abc.ABCMeta):
    """
    New Search Chunk Protocol 1.0 follows the below steps
    1. splunk send action type "getinfo" with header which hold meta information
    2. Command need to send back method with information like type of search
    3. Once initial hand shake is done, Splunk send data in chunks, search command process it and
       send data back to splunk with {"finished":false}. This can be infinite times
    4. When search is done, it need to send { "finished": true } to splunk to finalize the search
    """

    # Default streams; class attributes so subclasses/tests may override them.
    stdin = sys.stdin
    stdout = sys.stdout
    stderr = sys.stderr

    def __init__(self, output_meta_data, is_send_ack=True, logger=None):
        """
        Initialize. It negotiates the getinfo exchange fully and defines
        self.session_key and self.args (the search command arguments).

        @type output_meta_data: dict
        @param output_meta_data: output meta data which needs to be sent during the
            getinfo exchange phase; this dict must contain 'type' (possible values:
            'streaming', 'stateful', 'events', 'reporting')
        @type is_send_ack: bool
        @param is_send_ack: set this flag to False to avoid sending the write-back
            for the getinfo handshake; in that case write_chunk must be called to
            complete the handshake
        @type logger: object
        @param logger: if specified, overrides the class logger
        @raise AttributeError: when 'type' is missing from output_meta_data
        @raise ValueError: when 'type' is invalid or the getinfo chunk is empty
        """
        if not isinstance(output_meta_data, dict) or 'type' not in output_meta_data:
            raise AttributeError("Type is not defined in out_meta_data to do get-exchange phase.")
        if output_meta_data.get('type') not in ['streaming', 'stateful', 'events', 'reporting']:
            raise ValueError('Invalid type value')
        self.logger = logger if logger else getLogger()
        self.timeout_read = CONF_CUSTOMSEARCH_SETTING_TIMEOUT_READ_DEFAULT

        # Phase 0: getinfo exchange phase
        ret = self.read_getinfo(sys.stdin)
        if not ret:
            raise ValueError("Getinfo exchanges does not contain meta data")
        metadata, body = ret

        # Store earliest time.
        # Try the top level first, though in practice it lives under searchinfo.
        earliest_time = metadata.get('earliest_time')
        if metadata.get('searchinfo'):
            searchinfo = metadata.get('searchinfo', {})
            if not earliest_time:
                earliest_time = searchinfo.get('earliest_time')
            self.session_key = searchinfo.get('session_key')
            # Switch to a per-dispatch logger once the dispatch dir is known.
            self.logger = ItsiLogger(os.path.join(searchinfo.get('dispatch_dir'), SEARCH_LOG_FILE),
                                     searchinfo.get('command'))

        # Convert the argv-style argument list to a key/value dict.
        self.args = {}
        args = metadata.get('searchinfo', {}).get('args') or []
        for arg in args:
            if arg.find("=") != -1:
                # Split on the first '=' only, so values may themselves contain '='
                key, value = arg.split("=", 1)
                self.logger.debug("Adding search command argument key=%s, value=%s", key, value)
                self.args[key] = value
            else:
                self.logger.warning("Invalid argument, arg=%s", arg)

        # The timeout (in seconds) used when attempting to read a chunk
        self.timeout_read = self.fetch_setting_timeout_read()

        if not earliest_time:
            self.logger.warning("Earliest time (earliest_time) is undefined or zero")
            earliest_time = time.time()
        self.earliest_time = float(earliest_time)

        # Make sure 'finished' is defined before acknowledging.
        output_meta_data.setdefault('finished', False)

        # Subclass hook validating search command arguments; on failure the
        # messages are surfaced in the job inspector and the command exits.
        is_valid, inspector_msgs = self.validate_search_args()
        if not is_valid:
            if isinstance(inspector_msgs, list):
                self.exit_with_error(output_meta_data, inspector_msgs)
            else:
                # exit_with_error expects a list; a bare string would be
                # iterated character by character.
                self.exit_with_error(output_meta_data, ["Invalid arguments in search command"])
        else:
            if is_send_ack:
                self.write_chunk(output_meta_data, '')
        self.writer = None
        self.output_buf = None
        self.logger.debug('Search Chunk Protocol has initialized successfully.')

    def fetch_setting_timeout_read(self):
        """
        Fetches and returns the timeout conf setting used when reading a chunk,
        falling back to the default when the setting is missing or invalid.

        @rtype: int
        @return: the timeout value (seconds)
        """
        try:
            value = int(get_itsi_conf_setting(
                self.session_key,
                CONF_CUSTOMSEARCH_STANZA,
                CONF_CUSTOMSEARCH_SETTING_TIMEOUT_READ,
                logger=self.logger))
        except Exception:
            # Covers a missing session_key, REST failures and non-numeric values.
            value = CONF_CUSTOMSEARCH_SETTING_TIMEOUT_READ_DEFAULT
        return value

    def get_session_key(self):
        """
        Return the session key captured during the getinfo exchange.

        @rtype: basestring
        @return: splunkd session key
        """
        return self.session_key

    def get_search_args(self):
        """
        Return the parsed search command arguments.

        @rtype: dict
        @return: search command parameters (key -> value)
        """
        return self.args

    def read_getinfo(self, fp=None):
        """
        Read the action:getinfo chunk which splunkd sends first to external commands.

        @type fp: object
        @param fp: file descriptor to read the data from (defaults to self.stdin)
        @rtype: tuple or None
        @return: (metadata, body) tuple, or None when nothing could be read
        """
        if fp is None:
            fp = self.stdin
        ret = self.read_chunk(fp)
        if not ret:
            return None
        metadata, body = ret
        return metadata, body

    def read_chunk(self, fp=None):
        """
        Read a chunk and parse it, delegating to a daemon worker thread so the
        read can be bounded by self.timeout_read.

        @type fp: object
        @param fp: file descriptor to read the data from
        @rtype: tuple or None
        @return: (metadata, body) tuple, otherwise None on parse failure.
            Raises queue.Empty when the worker produces nothing in time; the
            exception is deliberately propagated because callers may not handle
            an error return value properly.
        """
        worker_results = Queue()
        worker = Thread(target=self._thread_read_chunk, args=(worker_results, fp))
        worker.daemon = True
        worker.start()
        chunk = worker_results.get(block=True, timeout=self.timeout_read)
        return chunk

    def _thread_read_chunk(self, results_queue, fp=None):
        """
        Worker routine for reading chunks.

        @type results_queue: Queue
        @param results_queue: queue to place the parsed chunk (or None) in
        @type fp: object
        @param fp: file descriptor to read the data from
        """
        chunk = self._read_chunk(fp)
        results_queue.put(chunk)

    def _read_chunk(self, fp=None):
        """
        Read a chunk and parse it.

        @type fp: object
        @param fp: file descriptor to read the data from (defaults to self.stdin)
        @rtype: tuple or None
        @return: (metadata, body) tuple, otherwise logs the failure and returns None
        """
        if fp is None:
            fp = self.stdin
        # When the search has no data (e.g. index=abc where the index does not
        # exist), nothing arrives on stdin until Splunk writes to the pipe.
        # Both EOF and an empty search yield empty reads, so a timeout is used
        # to avoid an infinite loop.
        currentTime = time.time()
        header_chars = []
        buf = fp
        if sys.version_info >= (3, 0) and hasattr(fp, 'buffer'):
            # Use the raw byte buffer under Python 3 text streams.
            buf = fp.buffer
        while True:
            try:
                # Read the header byte-by-byte until newline or timeout.
                while not fp.closed and time.time() - currentTime < self.timeout_read:
                    c = buf.read(1).decode()
                    header_chars.append(c)
                    if c == '\n':
                        break
            except Exception as e:
                self.logger.exception(e)
                return None
            header = ''.join(header_chars)
            self.logger.debug("Header='%s'", header)
            if not header:
                # If timed out, log and bail out; otherwise keep waiting.
                if time.time() - currentTime >= self.timeout_read:
                    self.logger.error("Timeout while reading header of chunk command")
                    return None
                else:
                    continue
            else:
                break
        # Transport header: "chunked 1.0,<metadata_len>,<body_len>\n"
        m = re.match(r'chunked\s+1.0\s*,\s*(?P<metadata_length>\d+)\s*,\s*(?P<body_length>\d+)\s*\n', header)
        if m is None:
            self.logger.error('Failed to parse transport header: %s', header)
            return None
        try:
            metadata_length = int(m.group('metadata_length'))
            body_length = int(m.group('body_length'))
        except Exception as e:
            self.logger.exception(e)
            self.logger.error('Failed to parse metadata or body length')
            return None
        self.logger.debug('READING CHUNK %d %d', metadata_length, body_length)
        try:
            metadata_buf = buf.read(metadata_length)
            body = buf.read(body_length)
            if sys.version_info >= (3, 0):
                body = body.decode()
        except Exception as e:
            self.logger.exception(e)
            self.logger.error('Failed to read metadata or body: %s' % str(e))
            return None
        try:
            metadata = json.loads(metadata_buf)
        except Exception as e:
            self.logger.exception(e)
            self.logger.error('Failed to parse metadata JSON')
            return None
        return metadata, body

    def write_chunk(self, metadata, body, fp=None):
        """
        Send a chunk (transport header, metadata JSON, body) to splunk.

        @type metadata: dict
        @param metadata: metadata which needs to be added
        @type body: basestring
        @param body: data to send to splunk; if None it is set to an empty string
        @type fp: object
        @param fp: file descriptor to write to (defaults to stdout)
        @return: None
        """
        self.logger.debug("Started writing chunk metadata=%s, body=%s", metadata, body)
        fs = FileStringHandler.getInstance()
        fp = fs.get_writer(fp)
        if body is None:
            body = ''
        # Encode before measuring: header lengths must count bytes, not chars.
        encoded_body = fs.encode_string(body)
        metadata_buf = None
        if metadata:
            metadata_buf = fs.encode_string(json.dumps(metadata))
        fp.write(fs.encode_string('chunked 1.0,%d,%d\n' % (len(metadata_buf) if metadata_buf else 0, len(encoded_body))))
        if metadata:
            fp.write(metadata_buf)
        fp.write(encoded_body)
        fp.flush()
        self.logger.debug('Successfully write chunk.')

    def add_inspector_msg(self, metadata, level, msg):
        """
        Add an inspector message to the metadata so it shows up in splunkd. If
        the level is ERROR it is shown as the error message of the command.

        @type metadata: dict
        @param metadata: metadata of the chunk write protocol; if None an empty
            dict is used
        @type level: basestring
        @param level: message level (INFO|DEBUG|ERROR...)
        @type msg: basestring
        @param msg: message text
        @rtype: dict
        @return: the (possibly newly created) metadata dict
        """
        if metadata is None:
            metadata = {}
        inspector = metadata.setdefault('inspector', {})
        msgs = inspector.setdefault('messages', [])
        if level is not None and msg is not None:
            self.logger.debug('inspector message level=%s, message=%s', level, msg)
            msgs.append([level, msg])
        return metadata

    def exit_with_error(self, metadata, msgs):
        """
        Exit the command, reporting the given error messages to splunkd.

        @type metadata: dict
        @param metadata: metadata; if None an empty dict is used
        @type msgs: list
        @param msgs: list of error message strings
        @return: does not return; calls sys.exit(1)
        """
        if metadata is None:
            metadata = {}
        metadata['finished'] = True
        for msg in msgs:
            metadata = self.add_inspector_msg(metadata, 'ERROR', msg)
        self.write_chunk(metadata, '')
        # Log all messages at once; referencing the loop variable here would
        # raise NameError when msgs is empty.
        self.logger.error('Exiting external command because of errors=%s', msgs)
        sys.exit(1)

    def get_string_buffer(self):
        """
        Return a StringIO buffer. Building output in a buffer instead of by
        string concatenation roughly doubles performance.

        @rtype: StringIO
        @return: a new, empty StringIO object
        """
        if sys.version_info >= (3, 0):
            from io import StringIO
        else:
            from StringIO import StringIO
        return StringIO()

    def validate_search_args(self):
        """
        (Optional) override.
        Override this when additional checks are required for search command
        parameters. It is invoked during phase 0 (the getinfo exchange).

        @rtype: tuple
        @return: (is_valid flag, list of error messages)
        """
        return True, []

    def pre_processing(self):
        """
        (Optional) override.
        Override to perform work before reading data from splunk, right after
        the getinfo exchange.
        @return: None
        """
        pass

    def post_processing(self):
        """
        (Optional) override.
        Override to perform work after all data has been exchanged with splunk.
        @return: None
        """
        pass

    def execute(self):
        """
        Main loop: read chunks from splunkd and hand each one to run().
        @return: None
        """
        self.pre_processing()
        self.logger.debug("Start reading chunk data...")
        finished = False
        # Chunk counter; also lets run() distinguish the no-data case.
        chunk = 0
        while not finished:
            ret = self.read_chunk(sys.stdin)
            self.logger.debug("Read data=%s", ret)
            # Break when no more data
            if not ret:
                break
            metadata, body = ret
            finished = metadata.get('finished', False)
            reader = csv.DictReader(body.splitlines())
            self.output_buf = self.get_string_buffer()
            self.writer = get_csv_dict_writer(self.output_buf, fieldnames=self.get_field_names(reader))
            self.run(metadata, reader, chunk)
            chunk += 1
        self.post_processing()
        self.logger.debug("Finished sending data...")

    def get_field_names(self, reader):
        """Return the CSV field names of the incoming chunk (empty list when none)."""
        return reader.fieldnames or []

    @abc.abstractmethod
    def run(self, metadata, reader, chunk):
        """
        Must be overridden by the inheriting class.
        Read the passed data and write back to splunkd, either in chunks or by
        yielding, based upon the type of search — but you must write back.

        @type metadata: dict
        @param metadata: metadata sent by splunkd
        @type reader: csv.DictReader
        @param reader: data sent by splunkd (newline-delimited CSV) as a reader
        @type chunk: int
        @param chunk: index of the chunk being processed
        @return: None
        """
        pass
class Singleton(object):
    """
    A non-thread-safe helper class to ease implementing singletons.
    This should be used as a decorator -- not a metaclass -- to the
    class that should be a singleton.
    The decorated class can define an `__init__` function.
    To get the singleton instance, use the `getInstance` method. Trying
    to use `__call__` will result in a `TypeError` being raised.
    Limitations: The decorated class cannot be inherited from itself.
    Note: This class is repeated in SA-UserAccess.
    """

    def __init__(self, decorated):
        # The wrapped class whose single instance we will hand out.
        self._decorated = decorated

    def __call__(self):
        raise TypeError('Use `getInstance()` to access the Singleton')

    def __instancecheck__(self, inst):
        # Make `isinstance(obj, DecoratedClass)` keep working on the wrapper.
        return isinstance(inst, self._decorated)

    def getInstance(self, **kwargs):
        """
        Returns the singleton instance of the decorated class.
        On the first call, constructs the instance with **kwargs; thereafter,
        forwards **kwargs to the instance's optional `update_singleton` method
        (when it defines one) and returns the same instance.
        """
        # Test for a prior instance explicitly rather than catching
        # AttributeError around the whole body: the broad-catch pattern would
        # silently re-create the singleton whenever update_singleton itself
        # raised AttributeError.
        if not hasattr(self, '_instance'):
            self._instance = self._decorated(**kwargs)
        elif hasattr(self._instance, 'update_singleton'):
            self._instance.update_singleton(**kwargs)
        return self._instance
@Singleton
class FileStringHandler(object):
    """
    A utility class handling file and string encoding between py2 and py3 mode.
    This class ensures the following:
    1. On Windows, the trailing \n character is mapped to \r\n, which makes the
       written string longer than the length reported in the transport header
       (computed via len while it still contains only \n). Encoding the string
       gives a correct byte length, and also handles the unicode case.
    2. Allocates the fp buffer when writing the encoded string.
    """

    def __init__(self):
        # True when running under Python 3; drives buffer/encoding choices.
        self.py3 = sys.version_info >= (3, 0)

    def get_writer(self, fh):
        """Return the byte-level writer for fh (stdout when fh is None)."""
        target = sys.stdout if fh is None else fh
        if self.py3 and hasattr(target, 'buffer'):
            return target.buffer
        return target

    def encode_string(self, s):
        """Return s UTF-8 encoded under py3, unchanged under py2."""
        if self.py3:
            return s.encode('utf-8')
        return s