You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
Splunk_Deploiement/apps/Splunk_ML_Toolkit/bin/algos/LogisticRegression.py

117 lines
3.8 KiB

#!/usr/bin/env python
import pandas as pd
from sklearn.linear_model import LogisticRegression as _LogisticRegression
from base import BaseAlgo, ClassifierMixin
from codec import codecs_manager
from util import df_util
from util.param_util import convert_params, is_truthy
class LogisticRegression(ClassifierMixin, BaseAlgo):
    """Classifier algo backed by scikit-learn's ``LogisticRegression``.

    The wrapped estimator is always created with ``class_weight='balanced'``.
    The extra ``probabilities`` search parameter is not forwarded to the
    estimator; it only controls whether ``apply`` appends per-class
    probability columns to its output.

    NOTE(review): ``feature_variables``, ``target_variable`` and ``columns``
    are read here but never assigned in this class -- presumably they are set
    by the base classes (``handle_options`` / fitting); confirm there.
    """

    def __init__(self, options):
        """Parse the search options and build the underlying estimator.

        Args:
            options: mapping of search options. ``options['params']``, when
                present, holds the user-supplied parameters; ``fit_intercept``
                and ``probabilities`` are declared as booleans to
                ``convert_params``.
        """
        self.handle_options(options)

        out_params = convert_params(
            options.get('params', {}), bools=['fit_intercept', 'probabilities']
        )

        # 'probabilities' is consumed by apply() (see check_probabilities) and
        # is not a keyword of the scikit-learn estimator, so drop it here.
        out_params.pop('probabilities', None)

        self.estimator = _LogisticRegression(class_weight='balanced', **out_params)

    def apply(self, df, options):
        """Predict with the fitted estimator and merge the results into ``df``.

        Args:
            df: input dataframe; it is copied before feature preparation.
            options: mapping of search options. Reads ``mlspl_limits``,
                ``output_name`` and (via ``check_probabilities``) ``params``.

        Returns:
            The result of ``df_util.merge_predictions(df, output)``, where
            ``output`` holds the predicted label column and, when
            probabilities were requested, one ``probability(<target>=<class>)``
            column per entry of ``estimator.classes_``.
        """
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        # Prepare the dataset
        X, nans, columns = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            final_columns=self.columns,
            mlspl_limits=options.get('mlspl_limits'),
        )

        # Make predictions
        y_hat = self.estimator.predict(X.values)

        # Assign output_name
        default_name = 'predicted({})'.format(self.target_variable)
        output_name = options.get('output_name', default_name)

        # Create output
        output = df_util.create_output_dataframe(
            y_hat=y_hat, nans=nans, output_names=output_name
        )

        if self.check_probabilities(options):
            # predict probs
            y_hat_proba = self.estimator.predict_proba(X.values)

            # get names
            class_names = [
                'probability({}={})'.format(self.target_variable, cls_name)
                for cls_name in self.estimator.classes_
            ]

            # create output data frame
            output_proba = df_util.create_output_dataframe(
                y_hat=y_hat_proba, nans=nans, output_names=class_names
            )

            # combine
            output = pd.concat([output, output_proba], axis=1)

        df = df_util.merge_predictions(df, output)
        return df

    @staticmethod
    def check_probabilities(options):
        """Return whether the ``probabilities`` parameter was set to a truthy value.

        Args:
            options: mapping of search options; only ``options['params']`` is
                inspected, and parameters other than ``probabilities`` are
                ignored (``ignore_extra=True``).

        Returns:
            ``is_truthy(...)`` of the converted ``probabilities`` parameter,
            or ``False`` when the parameter is absent.
        """
        out_params = convert_params(
            options.get('params', {}), bools=['probabilities'], ignore_extra=True
        )
        if 'probabilities' not in out_params:
            return False
        return is_truthy(out_params['probabilities'])

    def summary(self, options):
        """Return the fitted coefficients (and intercepts) as a dataframe.

        Args:
            options: mapping of search options; it must contain exactly two
                entries (model name and mlspl_limits).

        Returns:
            A dataframe with ``feature``, ``coefficient`` and ``class``
            columns. For two classes a single set of coefficients is
            reported; otherwise one set per class. Unless
            ``estimator.intercept_`` is a plain float, each set also carries
            an ``_intercept`` row.

        Raises:
            RuntimeError: if ``options`` does not have exactly two entries.
        """
        if len(options) != 2:  # only model name and mlspl_limits
            raise RuntimeError(
                '"%s" models do not take options for summarization' % self.__class__.__name__
            )

        classes = self.estimator.classes_
        n_classes = len(classes)
        # A two-class model has a single row in coef_, so only one pass is needed.
        # NOTE(review): that single row is labelled with classes_[0] below;
        # scikit-learn documents binary coef_ as belonging to the positive
        # class (classes_[1]) -- confirm the label is intended.
        limit = 1 if n_classes == 2 else n_classes

        # Loop-invariant: whether per-class intercepts can be indexed.
        has_intercepts = not isinstance(self.estimator.intercept_, float)

        # Collect the per-class frames and concatenate once.
        # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same
        # row order and index labels and works on older pandas as well.
        frames = []
        for i, c in enumerate(classes[:limit]):
            cdf = pd.DataFrame(
                {'feature': self.columns, 'coefficient': self.estimator.coef_[i].ravel()}
            )
            if has_intercepts:
                intercept_row = pd.DataFrame(
                    {
                        'feature': ['_intercept'],
                        'coefficient': [self.estimator.intercept_[i]],
                    }
                )
                cdf = pd.concat([cdf, intercept_row])
            cdf['class'] = c
            frames.append(cdf)

        if not frames:
            # Mirrors the original's empty-frame result when there is nothing to report.
            return pd.DataFrame()
        return pd.concat(frames)

    @staticmethod
    def register_codecs():
        """Register ``SimpleObjectCodec`` for this class and the wrapped estimator.

        Two (module path, class name) pairs are registered with
        ``codecs_manager``: ``algos.LogisticRegression`` and
        ``sklearn.linear_model._logistic``, both under the name
        ``LogisticRegression``.
        """
        from codec.codecs import SimpleObjectCodec

        codecs_manager.add_codec(
            'algos.LogisticRegression', 'LogisticRegression', SimpleObjectCodec
        )
        codecs_manager.add_codec(
            'sklearn.linear_model._logistic', 'LogisticRegression', SimpleObjectCodec
        )