#!/usr/bin/env python
# Copyright (C) 2015-2019 Splunk Inc. All Rights Reserved.
from exec_anaconda import exec_anaconda_or_die
exec_anaconda_or_die()
import json
import re
import sys
import cexc
from cexc import BaseChunkHandler, CommandType
from util.rest_proxy import rest_proxy_from_searchinfo
from util import command_util
from util.searchinfo_util import searchinfo_from_cexc
from util.param_util import parse_args
from util.rest_url_util import make_splunk_url
from util.experiment_util import get_experiment_by_id, get_history_fields_from_experiment
from experiment.evaluation_metrics import compute_statistics, get_statistics_metadata
logger = cexc.get_logger('logexperiment')
messages = cexc.get_messages_logger()
class LogExperimentCommand(BaseChunkHandler):
    """Log the results of an experiment run to the experiment history store.

    Accumulates per-chunk statistics metadata while chunks stream through,
    and on the final chunk POSTs a history record (search id, schedule flag,
    experiment history fields, and computed statistics) to the MLTK
    experiment history REST endpoint.
    """

    def __init__(
        self,
        handler_data=None,
        in_file=sys.stdin.buffer,
        out_file=sys.stdout.buffer,
        err_file=sys.stderr,
    ):
        super(LogExperimentCommand, self).__init__(handler_data, in_file, out_file, err_file)
        self.exp_id = None  # experiment id parsed from the 'id' search argument
        self.app = None
        self.searchinfo = None  # populated from the getinfo chunk in handler()
        self.experiment = None  # experiment record, fetched once on the first data chunk
        self.exp_metadata_list = []  # per-chunk statistics metadata, consumed on the final chunk

    @staticmethod
    def handle_arguments(getinfo):
        """Take the getinfo metadata and return controller_options.

        Args:
            getinfo (dict): getinfo metadata

        Returns:
            controller_options (dict): options to be sent to controller

        Raises:
            RuntimeError: if the required 'id' parameter is missing
        """
        options = parse_args(getinfo['searchinfo']['args'])
        if options.get('params') is None or options['params'].get('id') is None:
            raise RuntimeError('Experiment ID must be specified, e.g: logexperiment id=... ')
        return options

    def setup(self):
        """Parse search string and choose processor.

        Returns:
            (dict): get info response (command type) and required fields. This
                response will be sent back to the CEXC process on the getinfo
                exchange (first chunk) to establish our execution type and
                required fields.
        """
        options = self.handle_arguments(self.getinfo)
        self.exp_id = options['params']['id']
        # The 'app' argument value is needed to correctly locate the experiment
        # as it may be a different app than the current app context this is invoked from.
        # By default, it's the current app the command is executed from, but override it
        # if the user specified a value for the app arg.
        app = options['params'].get("app")
        if app:
            self.searchinfo["app"] = app
        return {'type': CommandType.REPORTING}

    @staticmethod
    def _from_schedule(sid):
        """Return True when the search id indicates a scheduled (saved) search."""
        # Assume no realtime data (realtime searches start with 'rt_scheduler__').
        return sid.startswith('scheduler__')

    def handler(self, metadata, body):
        """Main handler we override from BaseChunkHandler.

        Handles the reading and writing of data to the CEXC process, and
        finishes negotiation of the termination of the process.

        Args:
            metadata (dict): metadata information
            body (str): data payload from CEXC

        Returns:
            (dict): metadata to be sent back to CEXC
            output_body (str): data payload to be sent back to CEXC
        """
        if command_util.should_early_return(metadata):
            return {'type': CommandType.REPORTING}
        if command_util.is_getinfo_chunk(metadata):
            self.searchinfo = searchinfo_from_cexc(metadata['searchinfo'], extra_fields=['sid'])
            return self.setup()
        # Save info we need to calculate stats when we process the final chunk.
        if self.experiment is None:
            rest_proxy = rest_proxy_from_searchinfo(self.searchinfo)
            self.experiment = get_experiment_by_id(rest_proxy, self.exp_id)
        self.exp_metadata_list.append(get_statistics_metadata(self.experiment, body))
        finished_flag = metadata.get('finished', False)
        if finished_flag:
            # Re-fetch the experiment so the history record reflects its latest state.
            rest_proxy = rest_proxy_from_searchinfo(self.searchinfo)
            experiment = get_experiment_by_id(rest_proxy, self.exp_id)
            experiment_history = get_history_fields_from_experiment(experiment)
            sid = self.searchinfo['sid']
            json_body = {'sid': sid, 'from_schedule': self._from_schedule(sid)}
            # Update json_body with experiment history
            json_body.update(experiment_history)
            # Update the json body with statistics. If statistics can't be computed, update with empty dictionary.
            statistics_dict = compute_statistics(self.exp_metadata_list)
            json_body.update(statistics_dict)
            # Send json_body to the history store
            url = make_splunk_url(
                rest_proxy,
                'user',
                extra_url_parts=['mltk', 'experiments', self.exp_id, 'history'],
            )
            reply = rest_proxy.make_rest_call('POST', url, jsonargs=json.dumps(json_body))
            if not reply['success']:
                content = reply['content']
                logger.warning(content)  # logger.warn is deprecated
                # The error body is usually Splunk's JSON messages envelope, but
                # fall back to the raw content so a malformed body doesn't mask
                # the real error with a JSONDecodeError/KeyError.
                try:
                    message = json.loads(content)['messages'][0]['text']
                except (ValueError, KeyError, IndexError, TypeError):
                    message = content
                raise RuntimeError(message)
        # Our final farewell
        return {'finished': finished_flag}, body
# Script entry point: run as a CEXC custom search command process.
if __name__ == "__main__":
    logger.debug("Starting logexperiment.py.")
    # DATA_RAW: hand chunk payloads to handler() as raw strings, untouched.
    LogExperimentCommand(handler_data=BaseChunkHandler.DATA_RAW).run()
    logger.debug("Exiting gracefully. Byee!!")