You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

205 lines
6.7 KiB

#!/usr/bin/env python
# Copyright (C) 2015-2019 Splunk Inc. All Rights Reserved.
from exec_anaconda import exec_anaconda_or_die
exec_anaconda_or_die()
import os
import conf
from util.param_util import is_truthy, parse_args, convert_params
from util.telemetry_util import log_uuid, log_fit_time, log_app_details, log_partial_fit, Timer
from util import command_util
import cexc
from chunked_controller import ChunkedController
from cexc import BaseChunkHandler, CommandType
logger = cexc.get_logger('fit')
messages = cexc.get_messages_logger()
class FitCommand(cexc.BaseChunkHandler):
    """FitCommand uses ChunkedController & one of two processors to fit models.

    The FitCommand can use either the FitBatchProcessor or the FitPartialProcessor,
    which is chosen based on the presence of the partial_fit parameter.
    """

    @staticmethod
    def handle_arguments(getinfo):
        """Take the getinfo metadata and return controller_options.

        Args:
            getinfo (dict): getinfo metadata from first chunk

        Returns:
            controller_options (dict): options to be passed to controller
            partial_fit (bool): boolean flag to indicate partial fit

        Raises:
            RuntimeError: if no arguments were supplied, or if
                handle_raw_options rejects the parsed options.
        """
        searchinfo = getinfo['searchinfo']
        if not searchinfo['raw_args']:
            raise RuntimeError('First argument must be an "algorithm"')

        # The first raw argument is the algorithm itself; only the remainder
        # is handed to the option parser.
        raw_options = parse_args(searchinfo['raw_args'][1:])
        controller_options, partial_fit = FitCommand.handle_raw_options(raw_options)
        controller_options['algo_name'] = searchinfo['args'][0]

        log_app_details(searchinfo.get('app'))
        return controller_options, partial_fit

    @staticmethod
    def handle_raw_options(controller_options):
        """Load command specific options.

        The dict is updated in place: a 'processor' entry is always set, and
        the 'apply' / 'partial_fit' entries are removed from
        controller_options['params'] once they have been interpreted.

        Args:
            controller_options (dict): options from handle_arguments

        Returns:
            controller_options (dict): dict of controller options
            partial_fit (bool): boolean flag for partial fit

        Raises:
            RuntimeError: if parameter conversion fails, or if 'apply' is
                false and no 'model_name' was given.
        """
        controller_options['processor'] = 'FitBatchProcessor'
        partial_fit = False

        if 'params' in controller_options:
            try:
                fit_params = convert_params(
                    params=controller_options['params'],
                    ignore_extra=True,
                    bools=['apply', 'partial_fit'],
                )
            except ValueError as e:
                raise RuntimeError(str(e))

            if 'apply' in fit_params:
                controller_options['apply'] = fit_params['apply']
                del controller_options['params']['apply']

                # A fit that is neither applied nor saved would produce nothing.
                if 'model_name' not in controller_options and not fit_params['apply']:
                    raise RuntimeError('You must save a model if you are not applying it.')

            if 'partial_fit' in fit_params:
                partial_fit = fit_params['partial_fit']
                del controller_options['params']['partial_fit']

        if partial_fit:
            log_partial_fit()
            controller_options['processor'] = 'FitPartialProcessor'

        return controller_options, partial_fit

    def _setup_watchdog(self):
        """Initialize and start watchdog.

        The watchdog is built from the controller's 'max_fit_time' and
        'max_memory_usage_mb' resource limits plus the path of a 'finalize'
        entry inside the search dispatch directory.
        """
        self.watchdog = command_util.get_watchdog(
            self.controller.resource_limits['max_fit_time'],
            self.controller.resource_limits['max_memory_usage_mb'],
            os.path.join(self.getinfo['searchinfo']['dispatch_dir'], 'finalize'),
        )
        self.watchdog.start()

    def setup(self):
        """Get options, start controller and return command type.

        Returns:
            (dict): get info response (command type) and required fields
        """
        self.controller_options, self.partial_fit = self.handle_arguments(self.getinfo)
        self.controller = ChunkedController(self.getinfo, self.controller_options)

        required_fields = self.controller.get_required_fields()
        return {'type': CommandType.EVENTS, 'required_fields': required_fields}

    def get_output_body(self):
        """Collect output body from controller.

        Returns:
            (str): body
        """
        return self.controller.output_results()

    def handler(self, metadata, body):
        """Main handler we override from BaseChunkHandler.

        Args:
            metadata (dict): metadata information
            body (str): data payload from CEXC

        Returns:
            (dict): metadata to be sent back to CEXC
            output_body (str): data payload to be sent back to CEXC

            Note that the early-return, getinfo and preview paths return the
            metadata dict alone rather than a (metadata, body) tuple.
        """
        if command_util.should_early_return(metadata):
            return {'type': CommandType.EVENTS}

        if command_util.is_getinfo_chunk(metadata):
            return self.setup()

        if self.getinfo.get('preview', False):
            logger.debug('Not running in preview.')
            return {'finished': True}

        # NOTE(review): assumes the base class initialises self.watchdog to a
        # falsy value before the first data chunk arrives - confirm in cexc.
        if not self.watchdog:
            self._setup_watchdog()

        finished_flag = metadata.get('finished', False)

        self.controller.load_data(body)

        # Partial fit should *always* execute on every chunk.
        # Non partial fit will execute on the last chunk.
        if self.partial_fit or finished_flag:
            self.controller.execute()
            output_body = self.get_output_body()
        else:
            output_body = None

        if finished_flag:
            self.controller.finalize()
            # Gracefully terminate watchdog
            if self.watchdog.started:
                self.watchdog.join()

            # Our final farewell
            self.log_performance_timers()

        return ({'finished': finished_flag}, output_body)

    def log_performance_timers(self):
        """Log the handler and controller timing counters at debug level.

        The values are passed as logger arguments so the message is only
        formatted when debug logging is enabled, and a malformed timer value
        cannot raise out of this final logging step.
        """
        # NOTE(review): _read_time/_handle_time/_write_time are not set in this
        # class; presumably maintained by BaseChunkHandler - confirm.
        logger.debug(
            "read_time=%f, handle_time=%f, write_time=%f, "
            "csv_parse_time=%f, csv_render_time=%f",
            self._read_time,
            self._handle_time,
            self._write_time,
            self.controller._csv_parse_time,
            self.controller._csv_render_time,
        )
if __name__ == "__main__":
    # Script entry point: run the fit command once, optionally under cProfile,
    # then report telemetry (uuid and total fit time).
    logger.debug("Starting fit.py.")

    # NOTE(review): the meaning of the ('profile', 'default', 'n') arguments is
    # defined by conf.get_mlspl_prop; presumably key, stanza and fallback - confirm.
    profiling_enabled = is_truthy(conf.get_mlspl_prop('profile', 'default', 'n'))

    # The profiler only exists when profiling was requested.
    profiler = None
    if profiling_enabled:
        import cProfile
        import pstats

        profiler = cProfile.Profile()
        profiler.enable()

    with Timer() as fit_timer:
        FitCommand(handler_data=BaseChunkHandler.DATA_RAW).run()

    if profiler is not None:
        from io import StringIO

        profiler.disable()
        report = StringIO()
        # Append the top 10 entries for each ordering to the same buffer.
        for sort_key in ('cumulative', 'time'):
            stats = pstats.Stats(profiler, stream=report).sort_stats(sort_key)
            stats.print_stats(10)
        logger.info("PROFILE: %s", report.getvalue())

    log_uuid()
    log_fit_time(fit_timer.interval)
    logger.debug("Exiting gracefully. Byee!!")