You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
184 lines
6.2 KiB
184 lines
6.2 KiB
#!/usr/bin/env python
|
|
|
|
import copy
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
from sklearn.feature_selection import GenericUnivariateSelect, f_classif, f_regression
|
|
from sklearn import __version__ as sklearn_version
|
|
|
|
import cexc
|
|
from distutils.version import StrictVersion
|
|
from base import BaseAlgo
|
|
from codec import codecs_manager
|
|
from codec.codecs import BaseCodec
|
|
from util.param_util import convert_params
|
|
from util import df_util
|
|
|
|
|
|
messages = cexc.get_messages_logger()
|
|
|
|
required_version = '0.19.0'
|
|
|
|
|
|
def has_required_version():
|
|
return StrictVersion(sklearn_version) >= StrictVersion(required_version)
|
|
|
|
|
|
class GenericUnivariateSelectCodec(BaseCodec):
|
|
@classmethod
|
|
def encode(cls, obj):
|
|
obj = copy.deepcopy(obj)
|
|
obj.score_func = obj.score_func.__name__
|
|
|
|
return {
|
|
'__mlspl_type': [type(obj).__module__, type(obj).__name__],
|
|
'dict': obj.__dict__,
|
|
}
|
|
|
|
@classmethod
|
|
def decode(cls, obj):
|
|
from sklearn.feature_selection import f_classif, f_regression, GenericUnivariateSelect
|
|
|
|
new_obj = GenericUnivariateSelect.__new__(GenericUnivariateSelect)
|
|
new_obj.__dict__ = obj['dict']
|
|
|
|
if new_obj.score_func == 'f_classif':
|
|
new_obj.score_func = f_classif
|
|
elif new_obj.score_func == 'f_regression':
|
|
new_obj.score_func = f_regression
|
|
else:
|
|
raise ValueError(
|
|
'Unsupported GenericUnivariateSelect.score_func "%s"' % new_obj.score_func
|
|
)
|
|
|
|
return new_obj
|
|
|
|
|
|
class FieldSelector(BaseAlgo):
|
|
def __init__(self, options):
|
|
self.handle_options(options)
|
|
|
|
out_params = convert_params(
|
|
options.get('params', {}),
|
|
floats=['param'],
|
|
strs=['type', 'mode'],
|
|
aliases={'type': 'score_func'},
|
|
)
|
|
|
|
if 'score_func' not in out_params:
|
|
out_params['score_func'] = f_classif
|
|
else:
|
|
if out_params['score_func'].lower() == 'categorical':
|
|
out_params['score_func'] = f_classif
|
|
elif out_params['score_func'].lower() in ['numerical', 'numeric']:
|
|
out_params['score_func'] = f_regression
|
|
else:
|
|
raise RuntimeError('type can either be categorical or numeric.')
|
|
|
|
if 'mode' in out_params:
|
|
if out_params['mode'] not in ('k_best', 'fpr', 'fdr', 'fwe', 'percentile'):
|
|
raise RuntimeError(
|
|
'mode can only be one of the following: fdr, fpr, fwe, k_best, and percentile'
|
|
)
|
|
|
|
if out_params['mode'] in ['fpr', 'fdr', 'fwe']:
|
|
if 'param' in out_params:
|
|
if not 0 < out_params['param'] < 1:
|
|
msg = 'Invalid param value for mode {}: param must be between 0 and 1.'.format(
|
|
out_params['mode']
|
|
)
|
|
raise ValueError(msg)
|
|
|
|
# k_best and percentile require integer param
|
|
if 'param' in out_params and out_params.get('mode') not in ['fdr', 'fpr', 'fwe']:
|
|
original_value = out_params['param']
|
|
out_params['param'] = int(out_params['param'])
|
|
if out_params['param'] != original_value:
|
|
msg = 'param value {} is not an integer; mode={} requires an integer.'
|
|
msg = msg.format(original_value, out_params.get('mode', 'percentile'))
|
|
raise ValueError(msg)
|
|
|
|
self.estimator = GenericUnivariateSelect(**out_params)
|
|
|
|
def handle_options(self, options):
|
|
if (
|
|
len(options.get('target_variable', [])) != 1
|
|
or len(options.get('feature_variables', [])) == 0
|
|
):
|
|
raise RuntimeError('Syntax error: expected "<target> FROM <field> ..."')
|
|
|
|
def fit(self, df, options):
|
|
# Make a copy of data, to not alter original dataframe
|
|
X = df.copy()
|
|
|
|
relevant_variables = self.feature_variables + [self.target_variable]
|
|
X, y, self.columns = df_util.prepare_features_and_target(
|
|
X=X,
|
|
variables=relevant_variables,
|
|
target=self.target_variable,
|
|
mlspl_limits=options.get('mlspl_limits'),
|
|
)
|
|
self.estimator.fit(X.values, y.values)
|
|
|
|
def apply(self, df, options):
|
|
# Make a copy of data, to not alter original dataframe
|
|
X = df.copy()
|
|
|
|
X, nans, columns = df_util.prepare_features(
|
|
X=X,
|
|
variables=self.feature_variables,
|
|
final_columns=self.columns,
|
|
mlspl_limits=options.get('mlspl_limits'),
|
|
)
|
|
y_hat = self.estimator.transform(X.values)
|
|
mask = self.estimator.get_support()
|
|
columns_select = np.array(self.columns)[mask]
|
|
width = len(columns_select)
|
|
|
|
if width == 0:
|
|
messages.warn(
|
|
'No fields pass the current configuration. Consider changing your parameters.'
|
|
)
|
|
|
|
default_name = 'fs'
|
|
output_name = options.get('output_name', default_name)
|
|
output_names = [output_name + '_%s' % x for x in columns_select]
|
|
|
|
output = df_util.create_output_dataframe(
|
|
y_hat=y_hat, nans=nans, output_names=output_names
|
|
)
|
|
|
|
df = df_util.merge_predictions(df, output)
|
|
return df
|
|
|
|
def summary(self, options):
|
|
"""Only model_name and mlspl_limits are supported for summary"""
|
|
if len(options) != 2:
|
|
raise RuntimeError(
|
|
'{class_name} models do not take options for summarization'.format(
|
|
class_name=self.__class__.__name__
|
|
)
|
|
)
|
|
|
|
df = pd.DataFrame(
|
|
{
|
|
'feature_variables': self.feature_variables,
|
|
'score': self.estimator.scores_.round(4),
|
|
'p-value': self.estimator.pvalues_.round(4),
|
|
},
|
|
index=self.feature_variables,
|
|
)
|
|
return df
|
|
|
|
@staticmethod
|
|
def register_codecs():
|
|
from codec.codecs import SimpleObjectCodec
|
|
|
|
codecs_manager.add_codec('algos.FieldSelector', 'FieldSelector', SimpleObjectCodec)
|
|
codecs_manager.add_codec(
|
|
'sklearn.feature_selection._univariate_selection',
|
|
'GenericUnivariateSelect',
|
|
GenericUnivariateSelectCodec,
|
|
)
|