#!/usr/bin/env python
import numpy as np
import cexc
from util import df_util
from util.base_util import MLSPLNotImplementedError
from util import algo_util
messages = cexc.get_messages_logger()
class BaseAlgo(object):
    """Interface definition for ML-SPL algorithms.

    Declares every entry and exit point the ML-SPL commands use. Inheriting
    from BaseAlgo is optional; its value is that any entry point the subclass
    does not implement raises a clear MLSPLNotImplementedError when called,
    rather than failing obscurely.
    """

    def __init__(self, options):
        """Initialize the algorithm (**required**).

        Called once, before the first batch of data arrives. Use it to check
        grammar, convert search parameters, and set up any objects the
        algorithm needs; a bare ``pass``/``return`` is fine otherwise.

        The `options` dict mirrors the SPL search. For example,

            | fit LinearRegression sepal_width from petal* fit_intercept=t

        yields:

            {
                'args': [u'sepal_width', u'petal*'],
                'params': {u'fit_intercept': u't'},
                'feature_variables': ['petal*'],
                'target_variable': ['sepal_width']
                'algo_name': u'LinearRegression',
                'mlspl_limits': { .. },
            }

        Keys always present:
            - args (list): fields used in the search
            - params (dict): key=value pairs from the search
            - feature_variables (list): fields to use as features
            - target_variable (str): field to predict
            - algo_name (str): algorithm name
            - mlspl_limits (dict): mlspl.conf stanza properties

        Keys present depending on the search:
            - model_name (str): model to save ('into' clause)
            - output_name (str): output field name ('as' clause)
            - split_by (list): fields used to split input rows into groups

        A 'from' clause marks a supervised search
        (``| fit LinearRegression target_variable from feature_variables``),
        while unsupervised algorithms omit it
        (``| fit KMeans feature_variables``). Note that feature_variables in
        `options` are NOT yet wildcard-matched against the data; any ``*``
        in field names is still literal at this point.
        """
        self.feature_variables = []
        self.target_variable = None
        raise MLSPLNotImplementedError(
            'The {} algorithm cannot be initialized.'.format(self.__class__.__name__)
        )

    def fit(self, df, options):
        """Create or update a model; may also make predictions (**required**).

        Called only by the fit command, after __init__ and after wildcard
        matching, at which point two attributes are set on self:

            self.feature_variables (list): fields used for predicting
            self.target_variable (str): field to predict (if 'from' was used)

        Return a pandas DataFrame here only when the algorithm necessarily
        predicts while fitting, or cannot be saved; otherwise predict in
        apply() and return nothing.

        `df` is a pandas DataFrame of search results; mutating it here also
        affects the DataFrame seen by a subsequent apply(). `options` is as
        described in __init__.
        """
        raise MLSPLNotImplementedError(
            'The {} algorithm does not support fit.'.format(self.__class__.__name__)
        )

    def partial_fit(self, df, options):
        """Update a model incrementally.

        Invoked by the fit command when partial_fit=t is given. Lets an
        estimator consume each "chunk" of data as it arrives instead of
        waiting for the full dataset. On the first call `options` matches
        those of fit(); subsequent calls reuse the options from the initial
        fit. `df` is a pandas DataFrame of search results.
        """
        raise MLSPLNotImplementedError(
            'The {} algorithm does not support partial_fit.'.format(self.__class__.__name__)
        )

    def apply(self, df, options):
        """Make predictions.

        Used by both the fit and apply commands. During fit it runs when
        fit() returned no DataFrame (skipped entirely if apply=f); during the
        apply command it always runs, and is only needed when the model is
        saved.

        Under the fit command `options` matches __init__; e.g.

            | fit LinearRegression y from X1 X2

        gives:

            {
                'args': [u'y', u'X1', 'X2'],
                'algo_name': u'LinearRegression',
                'feature_variables': ['X1', 'X2'],
                'target_variable': ['y'],
                'mlspl_limits': { ... },
            }

        Under the apply command `options` merges what was saved with the
        model and what the search passes in; algorithm parameters such as
        k=4 are ignored when applying. For

            | fit LogisticRegression y from X1 X2 into model as new_name

        `options` is:

            {
                'args': [u'y', u'X1', 'X2'],
                'algo_name': u'LogisticRegression',
                'model_name': 'model',
                'output_name': 'new_name',
                'feature_variables': ['X1', 'X2'],
                'target_variable': ['y'],
                'mlspl_limits': { ... },
            }

        and applying that model with ``| apply model as some_other_name``
        yields the same dict with 'output_name': 'some_other_name'.

        `df` is a pandas DataFrame of search results.
        """
        raise MLSPLNotImplementedError(
            'The {} algorithm does not support apply.'.format(self.__class__.__name__)
        )

    def summary(self, options):
        """Summarize the model; must return a pandas DataFrame.

        Only needed for saved models. By default `options` contains only:

            {
                'model_name': 'some_custom_model_name',
                'mlspl_limits': { ... },
            }

        Any parameters in the search are added under 'params'; e.g.
        ``| summary my_custom_model key=value`` adds
        ``'params': {'key': 'value'}``.
        """
        raise MLSPLNotImplementedError(
            'The {} algorithm does not support summary.'.format(self.__class__.__name__)
        )

    @staticmethod
    def register_codecs():
        """Declare how the model is serialized (saved/loaded).

        ML-SPL serializes models with custom codecs; the MLTK ships codecs
        for numpy arrays, pandas DataFrames, and common python objects. Most
        models only need the SimpleObjectCodec:

        >>> from codec.codecs import SimpleObjectCodec
        >>> codecs_manager.add_codec('algos.CustomAlgo', 'CustomAlgo', SimpleObjectCodec)

        Objects imported from the Python for Scientific Computing app (e.g.
        scikit-learn's StandardScaler) must be registered as well:

        >>> codecs_manager.add_codec('sklearn.preprocessing.data', 'StandardScaler', SimpleObjectCodec)

        Algorithms with circular references or other unusual structure may
        need a custom codec; see codec/codecs.py for examples.
        """
        raise MLSPLNotImplementedError('The algorithm does not support saving.')
class RegressorMixin(object):
    """Shared plumbing for supervised algos with a numeric target.

    Provides:
        - handle_options: validate target/feature presence
        - fit / apply: dataframe preparation and prediction flow
        - rename_output: hook for renaming the prediction column

    See algos/LinearRegression.py for an example of using this mixin.
    """

    def handle_options(self, options):
        """Raise unless there is exactly one target and at least one feature."""
        targets = options.get('target_variable', [])
        features = options.get('feature_variables', [])
        if len(targets) != 1 or len(features) == 0:
            raise RuntimeError('Syntax error: expected "<target> FROM <field> ..."')

    def fit(self, df, options):
        """Fit self.estimator, or return CV scores when kfold_cv is set."""
        # Work on a copy so the caller's dataframe is untouched.
        working = df.copy()
        # Features and target together form the set of used fields.
        fields = self.feature_variables + [self.target_variable]
        working, target, self.columns = df_util.prepare_features_and_target(
            X=working,
            variables=fields,
            target=self.target_variable,
            mlspl_limits=options.get('mlspl_limits'),
        )
        kfolds = options.get('kfold_cv')
        if kfolds is not None:
            # kfold_cv short-circuits fitting: report cross-validation scores.
            return algo_util.get_kfold_cross_validation(
                estimator=self.estimator,
                X=working.values,
                y=target.values,
                scoring=['r2', 'neg_mean_squared_error'],
                kfolds=kfolds,
            )
        self.estimator.fit(working.values, target.values)

    def apply(self, df, options):
        """Predict with self.estimator and merge the results into df."""
        prepared, nans, _ = df_util.prepare_features(
            X=df.copy(),  # copy: do not alter the original dataframe
            variables=self.feature_variables,
            final_columns=self.columns,
            mlspl_limits=options.get('mlspl_limits'),
        )
        predictions = self.estimator.predict(prepared.values)
        column_name = self.rename_output(
            default_names='predicted({})'.format(self.target_variable),
            new_names=options.get('output_name', None),
        )
        result = df_util.create_output_dataframe(
            y_hat=predictions, nans=nans, output_names=column_name
        )
        return df_util.merge_predictions(df, result)

    def rename_output(self, default_names, new_names=None):
        """Hook to rename output columns.

        Returns new_names when provided, otherwise default_names; override
        in an algo that needs different output column names.
        """
        if new_names is None:
            return default_names
        return new_names
class ClassifierMixin(object):
    """Shared plumbing for supervised algos with a categorical target.

    Keeps the target's unique values on self.classes so incremental
    (partial_fit) updates can be reconciled against the classes seen on the
    first chunk.

    Provides:
        - handle_options: validate target/feature presence
        - fit / partial_fit / apply: dataframe preparation and prediction flow
        - rename_output: hook for renaming the prediction column

    See algos/SVM.py for an example of using this mixin.
    """

    def handle_options(self, options):
        """Raise unless there is exactly one target and at least one feature."""
        targets = options.get('target_variable', [])
        features = options.get('feature_variables', [])
        if len(targets) != 1 or len(features) == 0:
            raise RuntimeError('Syntax error: expected "<target> FROM <field> ..."')
        self.classes = None

    def fit(self, df, options):
        """Fit self.estimator, or return CV scores when kfold_cv is set."""
        # Normalize the target column in place (intentionally mutates df so
        # a subsequent apply() sees the converted values).
        df[self.target_variable] = df_util.check_and_convert_target_variable(
            df, self.target_variable
        )
        working = df.copy()  # copy: further prep must not alter the original
        limits = options.get('mlspl_limits', {})
        # Guard against an unreasonable number of target classes.
        max_classes = int(limits.get('max_distinct_cat_values_for_classifiers', 100))
        df_util.limit_classes_for_classifier(working, self.target_variable, max_classes)
        fields = self.feature_variables + [self.target_variable]
        working, target, self.columns = df_util.prepare_features_and_target(
            X=working,
            variables=fields,
            target=self.target_variable,
            mlspl_limits=limits,
        )
        kfolds = options.get('kfold_cv')
        if kfolds is not None:
            # kfold_cv short-circuits fitting: report cross-validation scores.
            return algo_util.get_kfold_cross_validation(
                estimator=self.estimator,
                X=working.values,
                y=target.values,
                scoring=['f1_weighted', 'accuracy', 'precision_weighted', 'recall_weighted'],
                kfolds=kfolds,
            )
        self.estimator.fit(working.values, target.values)
        # Remember the classes observed during this fit.
        self.classes = np.unique(target)

    def partial_fit(self, df, options):
        """Incrementally update self.estimator with one chunk of data."""
        df[self.target_variable] = df_util.check_and_convert_target_variable(
            df, self.target_variable
        )
        working = df.copy()  # copy: do not alter the original dataframe
        algo_util.assert_estimator_supports_partial_fit(self.estimator)
        limits = options.get('mlspl_limits', {})
        max_classes = int(limits.get('max_distinct_cat_values_for_classifiers', 100))
        df_util.limit_classes_for_classifier(working, self.target_variable, max_classes)
        fields = self.feature_variables + [self.target_variable]
        working, target, columns = df_util.prepare_features_and_target(
            X=working,
            variables=fields,
            target=self.target_variable,
            mlspl_limits=limits,
        )
        if self.classes is None:
            # First chunk: record the classes and seed the estimator.
            self.classes = np.unique(target)
            self.estimator.partial_fit(working.values, target.values, self.classes)
            self.columns = columns
        else:
            # Later chunks: reconcile any unseen categorical values first.
            working, target = df_util.handle_new_categorical_values(
                working, target, options, self.columns, self.classes
            )
            if not working.empty:
                self.estimator.partial_fit(working.values, target.values)

    def apply(self, df, options):
        """Predict with self.estimator and merge the results into df."""
        prepared, nans, _ = df_util.prepare_features(
            X=df.copy(),  # copy: do not alter the original dataframe
            variables=self.feature_variables,
            final_columns=self.columns,
            mlspl_limits=options.get('mlspl_limits'),
        )
        predictions = self.estimator.predict(prepared.values)
        column_name = self.rename_output(
            default_names='predicted({})'.format(self.target_variable),
            new_names=options.get('output_name', None),
        )
        result = df_util.create_output_dataframe(
            y_hat=predictions, nans=nans, output_names=column_name
        )
        return df_util.merge_predictions(df, result)

    def rename_output(self, default_names, new_names=None):
        """Hook to rename output columns.

        Returns new_names when provided, otherwise default_names; override
        in an algo that needs different output column names.
        """
        if new_names is None:
            return default_names
        return new_names
class ClustererMixin(object):
    """Shared plumbing for unsupervised clustering algos.

    Provides:
        - handle_options: validate that only feature fields are given
        - fit / partial_fit / apply: dataframe preparation and cluster flow
        - rename_output: hook for renaming the cluster column

    See algos/KMeans.py for an example of using this mixin.
    """

    def handle_options(self, options):
        """Raise unless there are feature fields and no target field."""
        features = options.get('feature_variables', [])
        targets = options.get('target_variable', [])
        if len(features) == 0 or len(targets) > 0:
            raise RuntimeError('Syntax error: expected "<field> ..."')

    def fit(self, df, options):
        """Fit self.estimator on the prepared feature matrix."""
        working, nans, self.columns = df_util.prepare_features(
            X=df.copy(),  # copy: do not alter the original dataframe
            variables=self.feature_variables,
            mlspl_limits=options.get('mlspl_limits'),
        )
        if nans.any():
            # Null rows get no cluster assignment; tell the user up front.
            cexc.messages.warn(
                "NULL values found in the dataset. Clusters are not assigned for these values currently. "
                "Please consider handling null (or missing) entries to get appropriate clustering output."
            )
        self.estimator.fit(working.values)

    def partial_fit(self, df, options):
        """Incrementally update self.estimator with one chunk of data."""
        working = df.copy()  # copy: do not alter the original dataframe
        algo_util.assert_estimator_supports_partial_fit(self.estimator)
        working, _, columns = df_util.prepare_features(
            X=working,
            variables=self.feature_variables,
            mlspl_limits=options.get('mlspl_limits'),
        )
        if getattr(self, 'columns', None):
            # NOTE(review): the return value is discarded here, unlike
            # ClassifierMixin.partial_fit which reassigns X/y — confirm
            # whether handle_new_categorical_values mutates in place.
            df_util.handle_new_categorical_values(working, None, options, self.columns)
            if working.empty:
                return
        else:
            self.columns = columns
        self.estimator.partial_fit(working)

    def apply(self, df, options):
        """Assign clusters with self.estimator and merge them into df."""
        prepared, nans, _ = df_util.prepare_features(
            X=df.copy(),  # copy: do not alter the original dataframe
            variables=self.feature_variables,
            final_columns=self.columns,
            mlspl_limits=options.get('mlspl_limits'),
        )
        labels = self.estimator.predict(prepared.values)
        # Cast to str so cluster ids are not rendered as floating points.
        labels = labels.astype('str')
        column_name = self.rename_output(
            default_names='cluster',
            new_names=options.get('output_name', None),
        )
        result = df_util.create_output_dataframe(
            y_hat=labels, nans=nans, output_names=column_name
        )
        return df_util.merge_predictions(df, result)

    def rename_output(self, default_names, new_names=None):
        """Hook to rename output columns.

        Returns new_names when provided, otherwise default_names; override
        in an algo that needs different output column names.
        """
        if new_names is None:
            return default_names
        return new_names
class TransformerMixin(object):
    """Shared plumbing for unsupervised algos that transform data.

    Provides:
        - handle_options: validate that only feature fields are given
        - fit / apply: dataframe preparation and transform flow
        - make_output_names: default names for transformed columns
        - rename_output: hook for renaming output columns

    See algos/PCA.py for an example of using this mixin.
    """

    def handle_options(self, options):
        """Raise unless there are feature fields and no target field."""
        features = options.get('feature_variables', [])
        targets = options.get('target_variable', [])
        if len(features) == 0 or len(targets) > 0:
            raise RuntimeError('Syntax error: expected "<field> ..."')

    def fit(self, df, options):
        """Fit self.estimator on the prepared feature matrix."""
        working, _, self.columns = df_util.prepare_features(
            X=df.copy(),  # copy: do not alter the original dataframe
            variables=self.feature_variables,
            mlspl_limits=options.get('mlspl_limits'),
        )
        self.estimator.fit(working.values)

    def make_output_names(self, output_name=None, n_names=None):
        """Build default output column names: '<name>_0', '<name>_1', ...

        Falls back to the class name when output_name is None.
        """
        if output_name is None:
            output_name = str(type(self).__name__)
        return ['{}_{}'.format(output_name, i) for i in range(n_names)]

    def apply(self, df, options):
        """Transform with self.estimator and merge the results into df."""
        prepared, nans, _ = df_util.prepare_features(
            X=df.copy(),  # copy: do not alter the original dataframe
            variables=self.feature_variables,
            final_columns=self.columns,
            mlspl_limits=options.get('mlspl_limits'),
        )
        transformed = self.estimator.transform(prepared.values)
        # One output column per transformed dimension.
        requested_name = options.get('output_name', None)
        defaults = self.make_output_names(
            output_name=requested_name, n_names=transformed.shape[1]
        )
        column_names = self.rename_output(defaults, requested_name)
        result = df_util.create_output_dataframe(
            y_hat=transformed, nans=nans, output_names=column_names
        )
        return df_util.merge_predictions(df, result)

    def rename_output(self, default_names, new_names=None):
        """Hook to rename output columns.

        Returns new_names when provided, otherwise default_names; override
        in an algo that needs different output column names.
        """
        if new_names is None:
            return default_names
        return new_names