You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
460 lines
18 KiB
460 lines
18 KiB
from collections import OrderedDict
|
|
from sklearn.impute import SimpleImputer as _Imputer
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
from base import BaseAlgo, TransformerMixin
|
|
from codec import codecs_manager
|
|
from util.param_util import convert_params
|
|
from util.df_util import assert_any_fields, assert_any_rows, warn_on_missing_fields
|
|
|
|
import cexc
|
|
|
|
|
|
WARN_FIELDS_LIMIT = 20
|
|
|
|
messages = cexc.get_messages_logger()
|
|
|
|
|
|
def is_nan(x):
|
|
"""Check if input is "NaN".
|
|
|
|
Args:
|
|
x (numeric or string): any string or numeric data
|
|
|
|
Returns:
|
|
bool
|
|
"""
|
|
return x is np.nan or str(x).lower() == 'nan'
|
|
|
|
|
|
def is_all_nan(series):
|
|
"""Check if series is all "NaN".
|
|
|
|
Args:
|
|
series (pandas.core.series.Series)
|
|
|
|
Returns:
|
|
bool
|
|
"""
|
|
return series.apply(is_nan).all()
|
|
|
|
|
|
def is_any_nan(series):
|
|
"""Check if there are any "NaN" in series.
|
|
|
|
Args:
|
|
series (pandas.core.series.Series)
|
|
|
|
Returns:
|
|
bool
|
|
"""
|
|
return series.apply(is_nan).any()
|
|
|
|
|
|
def not_missing(df, missing):
|
|
"""Return True wherever `df` values are not `missing`, else False.
|
|
|
|
Args:
|
|
df (pandas.DataFrame)
|
|
missing (string or integer): value that represents missing data
|
|
|
|
Returns:
|
|
pandas.DataFrame: DataFrame with values that are False where
|
|
the corresponding value in `df` is missing (i.e., matches `missing`)
|
|
and True otherwise.
|
|
"""
|
|
tt = type(missing)
|
|
if tt is int:
|
|
return df != missing
|
|
if missing is np.NaN or (tt is str and missing.lower() == "nan"):
|
|
return ~df.applymap(is_nan)
|
|
# This case shouldn't happen as Imputer is currently defined:
|
|
if tt is str:
|
|
return ~df.applymap(lambda x: str(x) == missing)
|
|
assert False, 'Never reach'
|
|
|
|
|
|
class Imputer(TransformerMixin, BaseAlgo):
|
|
"""Instance of Imputer algorithm to fill in missing values in data."""
|
|
|
|
SKLEARN_STRATEGIES = ['mean', 'median', 'most_frequent']
|
|
|
|
def __init__(self, options):
|
|
"""Initialize instance of Imputer.
|
|
|
|
Args:
|
|
options (dict): contains SPL arguments passed to
|
|
`...|<fit|apply|summary> Imputer`
|
|
|
|
Returns:
|
|
Imputer: instance of Imputer
|
|
|
|
Raises:
|
|
RuntimeError:
|
|
- Invalid value for "missing_values" parameter: must be
|
|
integer or "NaN" (case-insensitive).
|
|
- Invalid value for "strategy" parameter: must be one
|
|
of ["mean", "median", "most_frequent", "field"].
|
|
"""
|
|
valid_strategies = Imputer.SKLEARN_STRATEGIES + ['field']
|
|
|
|
# Check that the user supplied one or more feature_variables and no
|
|
# target_variable. If these conditions are not met, `handle_options`
|
|
# raises an error.
|
|
self.handle_options(options)
|
|
|
|
# Convert parameters to the correct types for Imputer.
|
|
params = convert_params(
|
|
options.get('params', {}), strs=['missing_values', 'strategy', 'field']
|
|
)
|
|
|
|
# Check if the user supplied a valid `missing_values` (either integer
|
|
# or 'NaN', as required by sklearn), otherwise use sklearn's default.
|
|
self.missing_values = 'NaN'
|
|
if 'missing_values' in params:
|
|
params['missing_values'] = Imputer.cast_to_nan_or_int(params['missing_values'])
|
|
self.missing_values = params['missing_values']
|
|
|
|
# Check if the user supplied a valid `strategy`, otherwise use
|
|
# sklearn's default strategy of "mean".
|
|
self.strategy = params.get('strategy', 'mean')
|
|
if self.strategy not in valid_strategies:
|
|
err_msg = 'Invalid value for "strategy" parameter: must be one of {{{}}}.'.format(
|
|
', '.join(valid_strategies)
|
|
)
|
|
raise RuntimeError(err_msg)
|
|
if self.strategy == 'field' and 'field' not in params:
|
|
err_msg = 'You must specify the "field" parameter when using the "field" strategy.'
|
|
raise RuntimeError(err_msg)
|
|
|
|
self.field = params.get('field')
|
|
if 'field' in params and self.strategy != 'field':
|
|
err_msg = 'The parameter "field" can only be used with strategy="field".'
|
|
raise RuntimeError(err_msg)
|
|
|
|
# Store input column names and initialize Imputer. The
|
|
# `feature_variables` attribute is also assigned in
|
|
# `get_relevant_fields` in the FitBatchProcessor. If the instance does
|
|
# not have the `feature_variables` attribute (e.g.,
|
|
# `get_relevant_fields` is not run) then `feature_variables` will not
|
|
# be updated appropriately by the processor when '*' is used (see
|
|
# `match_and_assign_variables`).
|
|
self.feature_variables = options['feature_variables']
|
|
self._params = params
|
|
self.estimator = None
|
|
|
|
def _init_estimator_and_fix_input(self, df):
|
|
"""Initialize the estimator and modify the input
|
|
according to the strategy"""
|
|
|
|
# Scikit Learn does not support strategy="field", so we can't initialize
|
|
# it with that parameter setting.
|
|
if self.strategy not in Imputer.SKLEARN_STRATEGIES:
|
|
return
|
|
# If `missing_values` is a variant of NaN (i.e.,
|
|
# np.nan, 'NaN, or 'nan' then we need to setup the
|
|
# sklearn's SimpleImputer differently. In particular,
|
|
# if the strategy is 'mean' or 'median', SimpleImputer
|
|
# will convert data to float, so the passed missing_values
|
|
# to SimpleImputer must be np.nan. If the strategy is
|
|
# 'most_frequent' then the passed missing_values
|
|
# must be 'nan' but since we want the SimpleImputer
|
|
# to impute both 'nan' and 'NaN' values then
|
|
# we need to replace 'NaN' with 'nan' in the input data.
|
|
missing_values = self._params.get('missing_values')
|
|
if (
|
|
missing_values
|
|
and missing_values is np.nan
|
|
or (type(missing_values) == str and missing_values.lower() == 'nan')
|
|
):
|
|
if self.strategy != 'most_frequent':
|
|
self._params['missing_values'] = np.NaN
|
|
else:
|
|
for field in self.imputable:
|
|
# Replace 'NaN's with 'nan's
|
|
df[field].where(df[field] != 'NaN', 'nan', inplace=True)
|
|
self._params['missing_values'] = 'nan'
|
|
if self.estimator:
|
|
return
|
|
self.estimator = _Imputer(**self._params)
|
|
|
|
@staticmethod
|
|
def truncate_warn_fields(fields, limit=WARN_FIELDS_LIMIT, truncidx=3):
|
|
"""Cast field list to string, truncating if the length is greater than `limit`.
|
|
|
|
Args:
|
|
fields (list of strings): the list of fields to pring
|
|
limit (integer): the max length to print before truncating
|
|
|
|
Returns:
|
|
string: list of fields as string (possibly truncated)
|
|
"""
|
|
if len(fields) > limit:
|
|
return ', '.join(fields[:truncidx]) + '...' + ', '.join(fields[-truncidx:])
|
|
else:
|
|
return ', '.join(fields)
|
|
|
|
@staticmethod
|
|
def cast_to_nan_or_int(val):
|
|
"""Cast input to valid "missing_values" value: "NaN" or integer.
|
|
|
|
Args:
|
|
val: value to cast
|
|
|
|
Returns:
|
|
string or integer: "NaN" or integer
|
|
|
|
Raises:
|
|
RuntimeError: Invalid value for "missing_values" parameter: must be
|
|
integer or "NaN" (case-insensitive).
|
|
"""
|
|
if val.lower() == 'nan':
|
|
return np.NaN
|
|
try:
|
|
return int(val)
|
|
except:
|
|
err_msg = (
|
|
'Invalid value for "missing_values" parameter:'
|
|
' must be integer or "NaN" (case-insensitive).'
|
|
)
|
|
raise RuntimeError(err_msg)
|
|
|
|
def get_imputable_fields(self, df):
|
|
"""Return subset of input fields that can be imputed, not dropped.
|
|
|
|
`sklearn.preprocessing.Imputer` drops columns whose values are all
|
|
missing, or columns that contain NaN (case-insensitive "NaN" or np.nan)
|
|
when `strategy="mean"` or `strategy="median"` and `missing_values` is
|
|
an integer. Therefore, don't impute those, otherwise there is a
|
|
mismatch between the number of output column names and the number of
|
|
output columns when trying to name the output columns later.
|
|
|
|
Args:
|
|
self (Imputer): instance of Imputer
|
|
df (pandas.DataFrame): the input data
|
|
|
|
Returns:
|
|
list: list of fields that are imputable
|
|
"""
|
|
fields_present = [f for f in self.feature_variables if f in df.columns]
|
|
fields_present = list(OrderedDict.fromkeys(fields_present)) # deduplicate
|
|
|
|
# Warn about input fields that are not missing any data.
|
|
none_missing = ~(df[fields_present] == self.missing_values).any()
|
|
contains_nan = df[fields_present].apply(is_any_nan)
|
|
fields_copied = df[fields_present].columns[none_missing & ~contains_nan]
|
|
warn_fields = Imputer.truncate_warn_fields(fields_copied)
|
|
if warn_fields:
|
|
warn_msg = (
|
|
'The fields {} are not missing any data. '
|
|
'Imputed fields will be a copy.'.format(warn_fields)
|
|
)
|
|
messages.warn(warn_msg)
|
|
|
|
# No fields are dropped when strategy="field".
|
|
if self.strategy == 'field':
|
|
return df[fields_present].columns.tolist()
|
|
|
|
# sklearn.preprocessing.Imputer drops fields with all missing values.
|
|
# This applies when strategy is "mean", "median", or "most_frequent".
|
|
all_missing = (df[fields_present] == self.missing_values).all()
|
|
all_nan = df[fields_present].apply(is_all_nan)
|
|
cant_impute = all_missing | all_nan
|
|
warn_fields = Imputer.truncate_warn_fields(df[fields_present].columns[cant_impute])
|
|
if warn_fields:
|
|
warn_msg = 'Cannot impute fields {} with strategy={}: all values missing.'.format(
|
|
warn_fields, self.strategy
|
|
)
|
|
messages.warn(warn_msg)
|
|
|
|
# sklearn.preprocessing.Imputer drops fields containing NaNs when the
|
|
# missing_values parameter is integer and strategy is "mean" or "median".
|
|
if isinstance(self.missing_values, int) and self.strategy in ["mean", "median"]:
|
|
warn_fields = Imputer.truncate_warn_fields(
|
|
df[fields_present].columns[contains_nan & ~cant_impute]
|
|
)
|
|
if warn_fields:
|
|
warn_msg = (
|
|
'Cannot impute {}: "contains "NaN" and "missing_values" is integer.'.format(
|
|
warn_fields
|
|
)
|
|
)
|
|
messages.warn(warn_msg)
|
|
cant_impute = cant_impute | contains_nan
|
|
|
|
return df[fields_present].columns[~cant_impute].tolist()
|
|
|
|
def _impute(self, df, fn, replacements):
|
|
"""Internal method to impute the missing values of `df`.
|
|
|
|
This method imputes using either the `fn` (when "strategy" is "mean",
|
|
"median", or "most_frequent") or `replacements` (when "strategy" is
|
|
"field"). Either `fn` or `replacements` must be not None.
|
|
|
|
`sklearn.preprocessing.Imputer` drops columns whose values are all
|
|
missing, or columns that contain NaN (case-insensitive "NaN" or np.nan)
|
|
when `strategy="mean"` or `strategy="median"` and `missing_values` is
|
|
an integer. Therefore, the DataFrame returned may be missing these
|
|
dropped columns in these cases.
|
|
|
|
Args:
|
|
self (Imputer): instance of Imputer
|
|
df (pandas.DataFrame): the input data
|
|
fn (instancemethod): `sklearn.preprocessing.Imputer.fit` or
|
|
`sklearn.preprocessing.Imputer.fit_transform`
|
|
replacements (pandas.core.series.Series): the series containing the
|
|
replacement values
|
|
|
|
Returns:
|
|
pandas.DataFrame: df with missing values replaced (and possibly with
|
|
some columns dropped, see method docstring) or None (if fn and
|
|
replacements are both None)
|
|
|
|
Raises:
|
|
RuntimeError: Cannot impute string data.
|
|
"""
|
|
if fn is not None:
|
|
try:
|
|
return fn(df)
|
|
except ValueError as e: # Use a more helpful error message.
|
|
if 'could not convert string to float' in str(e):
|
|
err_msg = 'Cannot impute string data.'
|
|
raise RuntimeError(err_msg)
|
|
else:
|
|
raise e
|
|
if replacements is not None:
|
|
return df.where(
|
|
not_missing(df, self.missing_values), other=replacements, axis=0
|
|
).values
|
|
|
|
def _fit_or_apply(self, df, options):
|
|
"""Internal method to compute fill values and add imputed field.
|
|
|
|
Called by `fit` or `apply`, which passes in `fn`. Then this calls `fn`,
|
|
which should be `sklearn.preprocessing.Imputer.fit_transform` or
|
|
`sklearn.preprocessing.Imputer.transform`.
|
|
|
|
Args:
|
|
df (pandas.DataFrame): the input data
|
|
options (dict): the SPL arguments supplied to `fit`
|
|
|
|
Returns:
|
|
pandas.DataFrame: the original DataFrame concatenated with
|
|
additional imputed columns
|
|
"""
|
|
# By default, sklearn.preprocessing.Imputer makes a copy of the input
|
|
# dataframe, so it is not necessary to make a copy to avoid overwrite.
|
|
|
|
# The value passed to the `as` clause gets put in "output_name". If the
|
|
# field with that name already exists (e.g., "output_name" is "") then
|
|
# the field with that name is overwritten.
|
|
output_prefix = options.get('output_name', 'Imputed')
|
|
|
|
# Make sure input fields (feature_variables) are in the dataset.
|
|
assert_any_fields(df)
|
|
assert_any_rows(df)
|
|
warn_on_missing_fields(df, self.feature_variables)
|
|
fields_present = [f for f in self.feature_variables if f in df.columns]
|
|
|
|
if self.strategy == 'field':
|
|
if self.field not in df.columns:
|
|
err_msg = 'Cannot impute using {}: field not present in dataset.'.format(
|
|
self.field
|
|
)
|
|
raise RuntimeError(err_msg)
|
|
|
|
field_map = OrderedDict([(f'{output_prefix}_{f}', f) for f in fields_present])
|
|
|
|
self.imputable = self.get_imputable_fields(df)
|
|
|
|
self._init_estimator_and_fix_input(df)
|
|
fn = self.estimator.fit_transform if self.estimator else None
|
|
|
|
replacements = df[self.field] if self.field is not None else None
|
|
if self.imputable:
|
|
self.imputed = [k for (k, v) in field_map.items() if v in self.imputable]
|
|
y_hat = self._impute(df[self.imputable], fn, replacements)
|
|
y_hat = pd.DataFrame(y_hat, columns=self.imputed)
|
|
|
|
# If the field was imputable, then the imputed version should be used.
|
|
# Otherwise, use the original field.
|
|
assignments = {}
|
|
for imp, orig in field_map.items():
|
|
if orig in self.imputable:
|
|
assignments[imp] = y_hat[imp]
|
|
else: # dropped
|
|
assignments[imp] = df[field_map[imp]]
|
|
|
|
# This returns a copy of df with imputed fields appended.
|
|
return df.assign(**assignments)
|
|
|
|
def fit(self, df, options):
|
|
"""Call `fit_transform` to compute fill values and add imputed fields.
|
|
|
|
If fields that are not in the dataset are passed in to impute, these
|
|
are ignored. If passed-in fields are:
|
|
- all `missing_values` or all not a number (case-insensitive "NaN"
|
|
or all `np.nan`, or
|
|
- contain "NaN" (case-insensitive or `np.nan`) when `strategy` is
|
|
"mean" or "median",
|
|
then the "imputed" versions of these fields are just copies of the
|
|
originals, including missing values. Otherwise they are copies with
|
|
missing values imputed.
|
|
|
|
Args:
|
|
df (pandas.DataFrame): the input data
|
|
options (dict): the SPL arguments supplied to `fit`
|
|
|
|
Returns:
|
|
pandas.DataFrame: the original DataFrame concatenated with
|
|
additional imputed columns
|
|
"""
|
|
return self._fit_or_apply(df, options)
|
|
|
|
def apply(self, df, options):
|
|
"""Call `transform` to add imputed field with pre-computed fill values.
|
|
|
|
Assumes `fit` has already been called. See `fit` docstring for cases
|
|
when fields are copied rather than imputed.
|
|
|
|
Args:
|
|
df (pandas.DataFrame): the input data
|
|
options (dict): the SPL arguments supplied to `apply`
|
|
|
|
Returns:
|
|
pandas.DataFrame: the original DataFrame concatenated with
|
|
additional imputed columns
|
|
"""
|
|
return self._fit_or_apply(df, options)
|
|
|
|
def summary(self, options):
|
|
"""Provide the imputation fill values for each imputed field.
|
|
|
|
Args:
|
|
options (dict): the SPL arguments supplied to `summary`
|
|
|
|
Returns:
|
|
pandas.DataFrame: contains for each input field, the name of the
|
|
imputed field, and the imputation fill value
|
|
"""
|
|
statistics = 'Values in {}'.format(self.field)
|
|
if self.estimator is not None and hasattr(self.estimator, "statistics_"):
|
|
statistics = self.estimator.statistics_
|
|
return pd.DataFrame(
|
|
{
|
|
'imputable_field': self.imputable,
|
|
'imputed_field': self.imputed,
|
|
'imputed_value': statistics,
|
|
'imputation_strategy': self.strategy,
|
|
}
|
|
)
|
|
|
|
@staticmethod
|
|
def register_codecs():
|
|
from codec.codecs import SimpleObjectCodec
|
|
|
|
codecs_manager.add_codec('algos.Imputer', 'Imputer', SimpleObjectCodec)
|
|
codecs_manager.add_codec('sklearn.impute._base', 'SimpleImputer', SimpleObjectCodec)
|