You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
117 lines
3.8 KiB
117 lines
3.8 KiB
#!/usr/bin/env python
|
|
|
|
import pandas as pd
|
|
from sklearn.linear_model import LogisticRegression as _LogisticRegression
|
|
|
|
from base import BaseAlgo, ClassifierMixin
|
|
from codec import codecs_manager
|
|
from util import df_util
|
|
from util.param_util import convert_params, is_truthy
|
|
|
|
|
|
class LogisticRegression(ClassifierMixin, BaseAlgo):
|
|
def __init__(self, options):
|
|
self.handle_options(options)
|
|
|
|
out_params = convert_params(
|
|
options.get('params', {}), bools=['fit_intercept', 'probabilities']
|
|
)
|
|
|
|
if 'probabilities' in out_params:
|
|
del out_params['probabilities']
|
|
|
|
self.estimator = _LogisticRegression(class_weight='balanced', **out_params)
|
|
|
|
def apply(self, df, options):
|
|
# Make a copy of data, to not alter original dataframe
|
|
X = df.copy()
|
|
|
|
# Prepare the dataset
|
|
X, nans, columns = df_util.prepare_features(
|
|
X=X,
|
|
variables=self.feature_variables,
|
|
final_columns=self.columns,
|
|
mlspl_limits=options.get('mlspl_limits'),
|
|
)
|
|
# Make predictions
|
|
y_hat = self.estimator.predict(X.values)
|
|
|
|
# Assign output_name
|
|
default_name = 'predicted({})'.format(self.target_variable)
|
|
output_name = options.get('output_name', default_name)
|
|
|
|
# Create output
|
|
output = df_util.create_output_dataframe(
|
|
y_hat=y_hat, nans=nans, output_names=output_name
|
|
)
|
|
if self.check_probabilities(options):
|
|
# predict probs
|
|
y_hat_proba = self.estimator.predict_proba(X.values)
|
|
|
|
# get names
|
|
class_names = [
|
|
'probability({}={})'.format(self.target_variable, cls_name)
|
|
for cls_name in self.estimator.classes_
|
|
]
|
|
|
|
# create output data frame
|
|
output_proba = df_util.create_output_dataframe(
|
|
y_hat=y_hat_proba, nans=nans, output_names=class_names
|
|
)
|
|
# combine
|
|
output = pd.concat([output, output_proba], axis=1)
|
|
|
|
df = df_util.merge_predictions(df, output)
|
|
return df
|
|
|
|
@staticmethod
|
|
def check_probabilities(options):
|
|
out_params = convert_params(
|
|
options.get('params', {}), bools=['probabilities'], ignore_extra=True
|
|
)
|
|
|
|
if 'probabilities' in out_params:
|
|
probabilities = is_truthy(out_params['probabilities'])
|
|
else:
|
|
probabilities = False
|
|
return probabilities
|
|
|
|
def summary(self, options):
|
|
if len(options) != 2: # only model name and mlspl_limits
|
|
raise RuntimeError(
|
|
'"%s" models do not take options for summarization' % self.__class__.__name__
|
|
)
|
|
df = pd.DataFrame()
|
|
|
|
n_classes = len(self.estimator.classes_)
|
|
limit = 1 if n_classes == 2 else n_classes
|
|
|
|
for i, c in enumerate(self.estimator.classes_[:limit]):
|
|
cdf = pd.DataFrame(
|
|
{'feature': self.columns, 'coefficient': self.estimator.coef_[i].ravel()}
|
|
)
|
|
if not isinstance(self.estimator.intercept_, float):
|
|
cdf = cdf.append(
|
|
pd.DataFrame(
|
|
{
|
|
'feature': ['_intercept'],
|
|
'coefficient': [self.estimator.intercept_[i]],
|
|
}
|
|
)
|
|
)
|
|
cdf['class'] = c
|
|
df = df.append(cdf)
|
|
|
|
return df
|
|
|
|
@staticmethod
|
|
def register_codecs():
|
|
from codec.codecs import SimpleObjectCodec
|
|
|
|
codecs_manager.add_codec(
|
|
'algos.LogisticRegression', 'LogisticRegression', SimpleObjectCodec
|
|
)
|
|
codecs_manager.add_codec(
|
|
'sklearn.linear_model._logistic', 'LogisticRegression', SimpleObjectCodec
|
|
)
|