You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
124 lines
4.8 KiB
124 lines
4.8 KiB
#!/usr/bin/env python
|
|
|
|
from pandas import DataFrame
|
|
from sklearn.ensemble import GradientBoostingClassifier as _GradientBoostingClassifier
|
|
|
|
from cexc import get_messages_logger
|
|
from base import ClassifierMixin, BaseAlgo
|
|
from util.param_util import convert_params
|
|
from util.algo_util import handle_max_features
|
|
from codec import codecs_manager
|
|
from codec.codecs import SimpleObjectCodec, NoopCodec
|
|
|
|
# Module-level messages logger obtained from cexc; this module uses it to
# emit a warning when both max_depth and max_leaf_nodes are supplied.
messages = get_messages_logger()
|
|
|
|
|
|
class GradientBoostingClassifier(ClassifierMixin, BaseAlgo):
    """Wrap scikit-learn's ``GradientBoostingClassifier`` as a ClassifierMixin/BaseAlgo algorithm."""

    def __init__(self, options):
        """Validate the user-supplied parameters and build the estimator.

        Args:
            options: Mapping of algorithm options; its optional ``'params'``
                entry holds the estimator parameters.

        Raises:
            RuntimeError: If ``loss`` is given and is neither ``deviance``
                nor ``exponential``.
        """
        self.handle_options(options)

        # NOTE(review): convert_params presumably coerces each listed key to
        # the named type (str / float / int) -- confirm in util.param_util.
        estimator_kwargs = convert_params(
            options.get('params', {}),
            strs=['loss', 'max_features'],
            floats=['learning_rate', 'min_weight_fraction_leaf'],
            ints=[
                'n_estimators',
                'max_depth',
                'min_samples_split',
                'min_samples_leaf',
                'max_leaf_nodes',
                'random_state',
            ],
        )

        allowed_losses = ['deviance', 'exponential']
        if 'loss' in estimator_kwargs and estimator_kwargs['loss'] not in allowed_losses:
            raise RuntimeError("loss must be one of: {}".format(', '.join(allowed_losses)))

        if 'max_features' in estimator_kwargs:
            raw_max_features = estimator_kwargs['max_features']
            estimator_kwargs['max_features'] = handle_max_features(raw_max_features)

        # Both limits supplied: tell the user that max_depth has no effect.
        has_leaf_limit = 'max_leaf_nodes' in estimator_kwargs
        if has_leaf_limit and 'max_depth' in estimator_kwargs:
            messages.warn('max_depth ignored when max_leaf_nodes is set')

        self.estimator = _GradientBoostingClassifier(**estimator_kwargs)

    def apply(self, df, options):
        """Apply the fitted model to ``df`` by delegating to the parent class.

        ``n_features_`` is set on the estimator first: it was only added in
        sklearn 0.18, so this keeps compatibility with sklearn 0.17.
        """
        feature_count = len(self.columns)
        self.estimator.n_features_ = feature_count
        return super(GradientBoostingClassifier, self).apply(df, options)

    def summary(self, options):
        """Return a DataFrame with one ``feature``/``importance`` row per column.

        Raises:
            RuntimeError: If ``options`` does not contain exactly two entries.
        """
        # Exactly two entries are expected: the model name and mlspl_limits.
        if len(options) != 2:
            raise RuntimeError(
                '"%s" models do not take options for summarization' % self.__class__.__name__
            )

        feature_names = self.columns
        importances = self.estimator.feature_importances_.ravel()
        return DataFrame({'feature': feature_names, 'importance': importances})

    @staticmethod
    def register_codecs():
        """Register every codec needed to serialize this algorithm and its estimator."""
        from codec.codecs import TreeCodec

        # (module path, class name, codec) triples, registered in this order.
        registrations = [
            ('algos.GradientBoostingClassifier', 'GradientBoostingClassifier', SimpleObjectCodec),
            ('sklearn.ensemble._gb', 'GradientBoostingClassifier', GBTCodec),
            ('sklearn.ensemble._gb_losses', 'BinomialDeviance', SimpleObjectCodec),
            ('sklearn.ensemble._gb_losses', 'MultinomialDeviance', SimpleObjectCodec),
            ('sklearn.ensemble._gb_losses', 'ExponentialLoss', SimpleObjectCodec),
            ('sklearn.ensemble.gradient_boosting', 'LogOddsEstimator', SimpleObjectCodec),
            ('sklearn.ensemble.gradient_boosting', 'ScaledLogOddsEstimator', SimpleObjectCodec),
            ('sklearn.ensemble.gradient_boosting', 'BinomialDeviance', SimpleObjectCodec),
            ('sklearn.ensemble.gradient_boosting', 'PriorProbabilityEstimator', SimpleObjectCodec),
            ('sklearn.tree._classes', 'DecisionTreeRegressor', SimpleObjectCodec),
            ('sklearn.tree._tree', 'Tree', TreeCodec),
            ('sklearn._loss.loss', 'HalfMultinomialLoss', SimpleObjectCodec),
            ('_loss', 'CyHalfMultinomialLoss', NoopCodec),
            ('sklearn._loss.link', 'MultinomialLogit', SimpleObjectCodec),
            ('sklearn._loss.link', 'Interval', SimpleObjectCodec),
            ('sklearn._loss.loss', 'ExponentialLoss', SimpleObjectCodec),
            ('_loss', 'CyExponentialLoss', NoopCodec),
            ('sklearn._loss.link', 'HalfLogitLink', SimpleObjectCodec),
        ]
        for module_path, class_name, codec in registrations:
            codecs_manager.add_codec(module_path, class_name, codec)
|
|
|
|
|
|
class GBTCodec(SimpleObjectCodec):
    """Codec for scikit-learn gradient-boosting estimators.

    ``estimators_`` is turned into a nested list on encode and rebuilt as a
    NumPy array on decode; the rest of the work is delegated to
    ``SimpleObjectCodec``.
    """

    @classmethod
    def encode(cls, obj):
        """Encode a gradient-boosting estimator.

        Note:
            ``obj`` is modified in place: after this call its ``estimators_``
            attribute holds a nested list instead of an array.

        Args:
            obj: An instance of exactly ``GradientBoostingClassifier`` or
                ``GradientBoostingRegressor`` from ``sklearn.ensemble``.

        Returns:
            Whatever ``SimpleObjectCodec.encode`` returns for the object.

        Raises:
            AssertionError: If ``obj`` is not one of the two supported types.
        """
        import sklearn.ensemble

        supported_types = (
            sklearn.ensemble.GradientBoostingClassifier,
            sklearn.ensemble.GradientBoostingRegressor,
        )
        # Exact type match: subclasses of these estimators are rejected.
        # Raised explicitly (not via ``assert``) so the check still runs when
        # Python is started with -O; the exception type is still AssertionError.
        if type(obj) not in supported_types:
            raise AssertionError(
                'GBTCodec can only encode GradientBoostingClassifier or '
                'GradientBoostingRegressor, got %s' % type(obj).__name__
            )

        # NOTE(review): presumably the array is converted to a list so the
        # codec machinery can serialize its elements -- confirm in codec.codecs.
        obj.estimators_ = obj.estimators_.tolist()
        return SimpleObjectCodec.encode(obj)

    @classmethod
    def decode(cls, obj):
        """Decode an estimator and restore ``estimators_`` as a NumPy array.

        Args:
            obj: Encoded representation accepted by ``SimpleObjectCodec.decode``.

        Returns:
            The decoded object, with ``estimators_`` rebuilt via ``np.array``.
        """
        import numpy as np

        decoded = SimpleObjectCodec.decode(obj)
        decoded.estimators_ = np.array(decoded.estimators_)
        return decoded
|