#!/usr/bin/env python
from collections import OrderedDict

import pandas as pd
import numpy as np

import cexc

from .constants import HOWTO_CONFIGURE_MLSPL_LIMITS

logger = cexc.get_logger(__name__)


def drop_unused_and_missing(X, required_fields):
    """Drop columns that are not *required*, drop rows that have missing values.

    Args:
        X (dataframe): input dataframe
        required_fields (list): required fields

    Returns:
        X (dataframe): output dataframe, with some columns and rows dropped
        nans (np array): boolean array to indicate which rows have missing
            values in the original dataframe
    """
    drop_unused_fields(X, required_fields)
    warn_on_missing_fields(X, required_fields)
    drop_na_columns(X)
    X, nans = drop_na_rows(X)
    return X, nans


def get_cols_with_mixed_type(df):
    """Get the list of columns in a DataFrame that do not have uniform types.

    Args:
        df (dataframe): input dataframe

    Returns:
        list of column names
    """

    def is_mixed_type(df_col):
        return df_col.dtype == 'O' and len(df_col.apply(type).unique()) > 1

    return [col for col in df.columns if is_mixed_type(df[col])]


def mixed_type_cols_to_string(mixed_type_cols, df):
    """Convert columns with non-uniform types to strings.

    Args:
        mixed_type_cols (list): list of names of columns containing mixed types
        df (dataframe): input dataframe

    Returns:
        modified dataframe
    """
    for col in mixed_type_cols:
        df[col] = df[col].apply(str)
    return df
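

# Usage sketch (added for illustration; not part of the original module):
# a column holding both ints and strings is object-typed with more than one
# Python type, so it is flagged by get_cols_with_mixed_type and stringified
# by mixed_type_cols_to_string. The data values are made up.
#
#   >>> df = pd.DataFrame({'a': [1, 'x'], 'b': [1.0, 2.0]})
#   >>> get_cols_with_mixed_type(df)
#   ['a']
#   >>> mixed_type_cols_to_string(['a'], df)['a'].tolist()
#   ['1', 'x']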


def prepare_features(X, variables, final_columns=None, get_dummies=True, mlspl_limits=None):
    """Prepare features.

    This method defines the conventional steps to prepare features:
        - drop unused columns
        - drop rows that have missing values
        - optionally (if get_dummies==True)
            - convert categorical fields into indicator dummy variables
        - optionally (if final_columns is provided)
            - make the resulting dataframe match final_columns

    Args:
        X (dataframe): input dataframe
        variables (list): column names
        final_columns (list): finalized column names
        get_dummies (bool): whether categorical variables should be converted
            into indicator dummy variables
        mlspl_limits (dict): a dictionary containing values from mlspl.conf

    Returns:
        X (dataframe): prepared feature dataframe
        nans (np array): boolean array to indicate which rows have missing
            values in the original dataframe
        columns (list): sorted list of feature column names
    """
    if mlspl_limits is None:
        mlspl_limits = {}
    max_distinct_cat_values = int(mlspl_limits.get('max_distinct_cat_values', 100))

    X, nans = drop_unused_and_missing(X, variables)

    if get_dummies:
        filter_non_numeric(X, max_distinct_cat_values)
        mixed_type_cols = get_cols_with_mixed_type(X)
        X = mixed_type_cols_to_string(mixed_type_cols, X)
        X = pd.get_dummies(X, prefix_sep='=', sparse=False)

    if final_columns is not None:
        drop_unused_fields(X, final_columns)
        assert_any_fields(X)
        fill_missing_fields(X, final_columns)

    assert_any_rows(X)
    assert_any_fields(X)
    columns = sort_fields(X)
    return (X, nans, columns)
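

# Usage sketch (added for illustration; not part of the original module, and
# user-visible warnings assume the cexc runtime is available): a categorical
# column is dummy-encoded with '=' as the separator, and rows with missing
# values are reported via `nans`. The data values are made up.
#
#   >>> X = pd.DataFrame({'size': [1.0, 2.0, np.nan], 'color': ['r', 'b', 'r']})
#   >>> X, nans, columns = prepare_features(X, ['size', 'color'])
#   >>> columns
#   ['color=b', 'color=r', 'size']
#   >>> nans.tolist()
#   [False, False, True]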


def prepare_features_and_target(X, variables, target, mlspl_limits=None):
    """Prepare features and target.

    This method defines the conventional steps to prepare features and target:
        - drop unused columns
        - drop rows that have missing values
        - split the dataframe into features and target
        - convert categorical variables into indicator dummy variables

    Args:
        X (dataframe): input dataframe
        variables (list): column names
        target (str): column name
        mlspl_limits (dict): a dictionary containing values from mlspl.conf

    Returns:
        X (dataframe): prepared feature dataframe
        y (pd series): prepared target pandas series
        columns (list): sorted list of feature column names
    """
    if mlspl_limits is None:
        mlspl_limits = {}
    max_distinct_cat_values = int(mlspl_limits.get('max_distinct_cat_values', 100))

    X, _ = drop_unused_and_missing(X, variables)
    X, y = split_features_and_target(X, target)

    filter_non_numeric(X, max_distinct_cat_values)
    mixed_type_cols = get_cols_with_mixed_type(X)
    X = mixed_type_cols_to_string(mixed_type_cols, X)
    X = pd.get_dummies(X, prefix_sep='=', sparse=False)

    assert_any_fields(X)
    assert_any_rows(X)
    columns = sort_fields(X)
    return (X, y, columns)


def create_output_dataframe(y_hat, nans, output_names, shape=None):
    """Create the output dataframe.

    This method defines the steps to create the output dataframe:
        - initialize an empty dataframe according to
            - the given list of column names
            - the given or inferred shape
        - populate the dataframe with y_hat, filling in nan where there are
          no predictions

    Args:
        y_hat (numpy array or list): predictions
        nans (np array): boolean array to indicate which rows have missing
            values in the original dataframe
        output_names (str or list): column names for the output dataframe
        shape (tuple): shape of the output dataframe

    Returns:
        output (dataframe): output dataframe
    """
    if shape is None:
        # If we pass multiple output_names in a list,
        # the width is necessarily > 1
        if type(output_names) is list:
            shape = (len(nans), len(output_names))
        else:
            # MLA-1450: Flatten the result when y_hat.ndim == 2 due to sklearn bug.
            # https://github.com/scikit-learn/scikit-learn/issues/5058
            # y_hat can be a dataframe, series or list.
            if type(y_hat) is not list and y_hat.ndim == 2:
                y_hat = y_hat.reshape(-1)
            shape = len(nans)

    columns = output_names if type(output_names) is list else [output_names]
    output = pd.DataFrame(columns=columns, data=np.zeros(shape))
    output[output_names] = np.nan
    output.loc[output.index[~nans], output_names] = y_hat
    return output
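

# Usage sketch (added for illustration; not part of the original module):
# rows flagged in `nans` receive NaN instead of a prediction, so the output
# lines up row-for-row with the original events. The data values are made up.
#
#   >>> nans = np.array([False, True, False])
#   >>> out = create_output_dataframe([1, 2], nans, 'predicted')
#   >>> out['predicted'].tolist()
#   [1.0, nan, 2.0]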


def merge_predictions(original_df, additional_df):
    """Merge two dataframes.

    Args:
        original_df (dataframe): first dataframe
        additional_df (dataframe): second dataframe

    Returns:
        merged_df (dataframe): merged dataframe
    """
    assert original_df.index.size == additional_df.index.size
    # We need to make sure the indexes of both DataFrames are aligned
    # so the join is performed correctly
    if additional_df.index[0] != original_df.index[0]:
        additional_df.index = original_df.index
    dupes = additional_df.columns.intersection(original_df.columns)
    original_df.drop(columns=dupes, inplace=True)
    return pd.concat([original_df, additional_df], axis=1, join='inner')
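

# Usage sketch (added for illustration; not part of the original module):
# predictions are joined back onto the source events, and any overlapping
# column in the original frame is dropped (in place) in favor of the new one.
# The data values are made up.
#
#   >>> orig = pd.DataFrame({'x': [1, 2], 'predicted': [0, 0]})
#   >>> preds = pd.DataFrame({'predicted': [9, 8]})
#   >>> merge_predictions(orig, preds)['predicted'].tolist()
#   [9, 8]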


def get_unseen_value_behavior(options):
    """Load the option for handling new values in categorical fields.

    Args:
        options (dict): options

    Returns:
        handle_new_cat (str): the choice of how to handle new values
    """
    mlspl_limits = options.get('mlspl_limits', {})
    handle_new_cat = mlspl_limits.get('handle_new_cat', 'default')
    if 'params' in options:
        if options['params'].get('unseen_value', []):
            handle_new_cat = options['params']['unseen_value']
            del options['params']['unseen_value']
    return handle_new_cat


def handle_new_categorical_values(X, y, options, columns, classes=None):
    """Handle new/unseen categorical values.

    Categorical variables are usually converted to indicator dummy variables.
    Models with incremental fit capability save the total number of features
    produced. This method defines what to do when a new indicator variable is
    created from a previously unseen categorical value.

    Args:
        X (dataframe): feature dataframe
        y (pd series): target series
        options (dict): options
        columns (list): column names
        classes (np array): unique class labels in the target

    Returns:
        X (dataframe): feature dataframe
        y (pd series): target series
    """
    handle_new_cat = get_unseen_value_behavior(options)
    action_unseen = {'stop', 'default', 'skip'}
    if handle_new_cat not in action_unseen:
        raise Exception('Invalid value for "unseen_value": %s' % handle_new_cat)

    # Fill in empty columns if the input has fewer categorical values than
    # the ones the existing model was trained with
    fill_missing_fields(X, columns)

    if handle_new_cat == 'skip':
        # Remove rows containing new categorical values from X
        new_cat_ind_X, new_cat_cols = get_indicies_of_unseen_categorical_values(X, columns)
        if len(new_cat_ind_X) > 0:
            X, y = skip_unseen_categorical_values(X, y, new_cat_ind_X, new_cat_cols)
        # Remove rows containing new categorical values from y
        if classes is not None:
            X, y = skip_unseen_target_values(X, y, classes)
    elif handle_new_cat == 'default':
        # Drop the columns that correspond to new categorical value(s)
        new_cat_ind_X, new_cat_cols = get_indicies_of_unseen_categorical_values(X, columns)
        if len(new_cat_ind_X) > 0:
            X.drop(new_cat_cols, axis=1, inplace=True)
            cexc.messages.warn(
                'Columns corresponding to unseen categorical explanatory variable value(s) are omitted: %s'
                % new_cat_cols
            )
        # Remove rows containing new categorical values from y
        if classes is not None:
            X, y = skip_unseen_target_values(X, y, classes)
    else:
        # Stop when encountering rows containing new categorical values (X or y)
        new_col_in_X = np.setdiff1d(X.columns, columns)
        if len(new_col_in_X) > 0:
            raise RuntimeError(
                'New categorical value for explanatory variables in training data: %s'
                % new_col_in_X
            )
        if classes is not None:
            new_class_in_y = np.setdiff1d(y, classes)
            if len(new_class_in_y) > 0:
                raise RuntimeError('New target values in training data: %s' % new_class_in_y)

    return X, y


def filter_non_numeric(df, max_values=100):
    """Filter out non-numeric columns that have too many unique values.

    Args:
        df (dataframe): input dataframe
        max_values (int): maximum number of distinct values to allow

    Returns:
        df (dataframe): output dataframe
    """
    drop_cols = []
    convert_cols = []
    scols = list(df.dtypes[df.dtypes == 'object'].index)

    # TODO: Profile this loop.
    for scol in scols:
        if df[scol].nunique() > max_values:
            drop_cols.append(scol)
        else:
            convert_cols.append(scol)

    if len(drop_cols) > 0:
        cexc.messages.warn(
            'Dropping field(s) with too many distinct values: {}. {}'.format(
                ', '.join(drop_cols), HOWTO_CONFIGURE_MLSPL_LIMITS
            )
        )
        df.drop(drop_cols, inplace=True, axis=1)

    num_converted_cols = len(convert_cols)
    log_msg = 'Converting field(s) with categorical values into categorical fields: {}'.format(
        ', '.join(convert_cols)
    )
    if num_converted_cols > 0:
        msg = (
            log_msg
            if num_converted_cols <= 2
            else 'Converting {} field(s) with categorical values into categorical fields. Please see mlspl.log for details.'.format(
                num_converted_cols
            )
        )
        cexc.messages.warn(msg)
        logger.warning(log_msg)

    if len(df.columns) == 0:
        raise RuntimeError('No valid fields to fit or apply model to.')
    return df
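

# Usage sketch (added for illustration; not part of the original module, and
# the warnings assume the cexc runtime is available): string columns whose
# cardinality exceeds max_values are dropped rather than dummy-encoded.
# The data values are made up.
#
#   >>> df = pd.DataFrame({'id': ['a', 'b', 'c'], 'cat': ['x', 'x', 'y']})
#   >>> filter_non_numeric(df, max_values=2).columns.tolist()
#   ['cat']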


def assert_field_present(df, field):
    """Make sure the field is present.

    Args:
        df (dataframe): input dataframe
        field (str): column name

    Raises:
        Exception
    """
    if field not in df:
        raise Exception('Field "%s" not present.' % field)


def limit_classes_for_classifier(df, field, max_values=100):
    """Limit the number of categories for classifiers.

    Args:
        df (dataframe): input dataframe
        field (str): column name
        max_values (int): the upper limit for the cardinality of the target

    Returns:
        df (dataframe): output dataframe
    """
    assert_field_present(df, field)
    n = df[field].nunique()
    if n > max_values:
        raise Exception(
            'Field "{}" has too many distinct values: {} (max {}). {}'.format(
                field, n, max_values, HOWTO_CONFIGURE_MLSPL_LIMITS
            )
        )
    nans = df[field].isnull()
    df.loc[nans, field] = np.nan
    return df


def drop_unused_fields(df, requested_fields):
    """Drop fields the user didn't ask for.

    Args:
        df (dataframe): input dataframe
        requested_fields (list): column names

    Returns:
        df (dataframe): output dataframe
    """
    drop_cols = set(df.columns).difference(requested_fields)
    df.drop(drop_cols, inplace=True, axis=1)
    return df


def warn_on_missing_fields(df, requested_fields):
    """Raise a user-visible warning for missing fields.

    Args:
        df (dataframe): input dataframe
        requested_fields (list): column names
    """
    missing_columns = set(requested_fields).difference(df.columns)
    if len(missing_columns) > 0:
        cexc.messages.warn('Missing field(s): %s', ', '.join(missing_columns))


def split_features_and_target(df, target_variable):
    """Split the input dataframe into features and target.

    Args:
        df (dataframe): input dataframe
        target_variable (str): column name

    Returns:
        features (dataframe): feature dataframe
        target (pandas series): target series
    """
    assert_field_present(df, target_variable)
    target = df.pop(target_variable)
    features = df
    assert_any_fields(features)
    return features, target


def drop_na_columns(df):
    """Drop columns where all values are missing/null.

    Args:
        df (dataframe): input dataframe

    Returns:
        df (dataframe): output dataframe, with na columns dropped
    """
    start_columns = df.columns
    df.dropna(axis=1, how='all', inplace=True)
    end_columns = df.columns
    drop_cols = set(start_columns).difference(end_columns)
    if len(drop_cols) > 0:
        cexc.messages.warn('Dropped field(s) with all null values: %s', ', '.join(drop_cols))
    return df


def drop_na_rows(df):
    """Drop rows that have missing values.

    Args:
        df (dataframe): input dataframe

    Returns:
        df (dataframe): output dataframe, with na rows dropped
        nans (np array): boolean array to indicate which rows have missing
            values in the original dataframe
    """
    nans = df.isnull().any(axis=1).values
    df.dropna(axis=0, how='any', inplace=True)
    return df, nans
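

# Usage sketch (added for illustration; not part of the original module):
# drop_na_rows reports which original rows were incomplete via the boolean
# `nans` array, which callers use to re-align predictions later.
#
#   >>> df = pd.DataFrame({'a': [1.0, np.nan, 3.0]})
#   >>> df, nans = drop_na_rows(df)
#   >>> nans.tolist()
#   [False, True, False]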


def assert_any_fields(df):
    """Make sure there are valid field(s).

    Args:
        df (dataframe): input dataframe

    Raises:
        RuntimeError
    """
    if len(df.columns) == 0:
        raise RuntimeError('No valid fields to fit or apply model to.')


def assert_any_rows(df):
    """Make sure there are valid row(s).

    Args:
        df (dataframe): input dataframe

    Raises:
        RuntimeError
    """
    if len(df) == 0:
        raise RuntimeError(
            'No valid events; check for null or non-numeric values in numeric fields'
        )


def sort_fields(df):
    """Sort the dataframe's columns by name, in place.

    Args:
        df (dataframe): input dataframe

    Returns:
        (list): sorted list of column names
    """
    df.sort_index(inplace=True, axis=1)
    return list(df.columns)


def fill_missing_fields(df, requested_fields):
    """Fill missing fields with 0's.

    Args:
        df (dataframe): input dataframe
        requested_fields (list): column names

    Returns:
        df (dataframe): output dataframe
    """
    missing_fields = set(requested_fields).difference(set(df.columns))
    if len(missing_fields) > 0:
        cexc.logger.debug('Filling missing field(s): %s', ', '.join(missing_fields))
        for col in missing_fields:
            df[col] = 0
    return df
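

# Usage sketch (added for illustration; not part of the original module, and
# the debug logging assumes the cexc helpers are available): dummy columns
# the model saw at fit time but that are absent at apply time are re-created
# as zeros. The column names are made up.
#
#   >>> df = pd.DataFrame({'color=r': [1, 0]})
#   >>> fill_missing_fields(df, ['color=r', 'color=b'])['color=b'].tolist()
#   [0, 0]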


def apply_in_chunks(df, func, n=1000, options=None):
    """Make predictions chunk by chunk.

    Args:
        df (dataframe): input dataframe
        func (callable): function that defines the apply behavior
        n (int): number of rows per chunk
        options (dict): options

    Returns:
        df (dataframe): output dataframe
    """

    def bechunk(df_, n_):
        return [df_[i : i + n_] for i in range(0, len(df_), n_)]

    dfs = [func(x.reset_index(drop=True), options) for x in bechunk(df, n)]
    # DataFrame.append() was removed in pandas 2.0; pd.concat with
    # ignore_index=True is equivalent to appending the chunks and then
    # resetting the index.
    df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
    return df
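

# Usage sketch (added for illustration; not part of the original module):
# applying a scoring function in chunks of two rows. The `func` here is a
# made-up stand-in that just doubles the values.
#
#   >>> df = pd.DataFrame({'a': [1, 2, 3]})
#   >>> apply_in_chunks(df, lambda chunk, _: chunk * 2, n=2)['a'].tolist()
#   [2, 4, 6]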


def get_indicies_of_unseen_categorical_values(X, columns):
    """Find the X-axis indices (row numbers) where new values are present.

    Args:
        X (dataframe): feature dataframe
        columns (list): column names

    Returns:
        new_cat_idx (np array): row indices
        new_categorical_columns (np array): new categorical columns
    """
    new_categorical_columns = np.setdiff1d(X.columns, columns)
    if len(new_categorical_columns) == 0:
        return (new_categorical_columns, None)
    new_cat_idx = np.where(X[new_categorical_columns].any(axis=1).values)[0]
    return new_cat_idx, new_categorical_columns


def skip_unseen_categorical_values(X, y, row_idx, new_categorical_columns):
    """Remove rows with unseen categorical value(s) from X.

    Args:
        X (dataframe): feature dataframe
        y (pd series): target series
        row_idx (np array): row indices
        new_categorical_columns (np array): new categorical columns

    Returns:
        X (dataframe): feature dataframe
        y (pd series): target series
    """
    X.drop(row_idx, axis=0, inplace=True)
    X.drop(new_categorical_columns, axis=1, inplace=True)
    if y is not None:
        y.drop(row_idx, axis=0, inplace=True)
    cexc.messages.warn(
        'Some events containing unseen categorical feature values have been skipped '
        'while updating the model.'
    )
    return X, y


def skip_unseen_target_values(X, y, classes):
    """Remove unseen categorical values from y. Also remove the rows of X
    that correspond to the removed y values.

    Args:
        X (dataframe): feature dataframe
        y (pd series): target series
        classes (np array): unique classes

    Returns:
        X (dataframe): feature dataframe
        y (pd series): target series
    """
    new_cat_cols_y = np.setdiff1d(np.unique(y), classes)
    if len(new_cat_cols_y) == 0:
        return X, y

    # Only keep rows where y is a previously seen class
    seen_categorical_mask = y.isin(classes)
    X = X[seen_categorical_mask]
    y = y[seen_categorical_mask]
    cexc.messages.warn(
        'Some events containing unseen categorical target values have been skipped '
        'while updating the model.'
    )
    return X, y


def check_and_convert_target_variable(df, field):
    """Check that the target variable is present, then convert its non-null
    values to str.

    If the target variable holds integer values but has float dtype, cast it
    to int before converting to str, since pandas promotes integer series to
    float when NaNs are present.

    Args:
        df (dataframe): input dataframe
        field (str): target field name

    Returns:
        y (pd series): target series
    """
    assert_field_present(df, field)
    y = df[field]
    y_notnull = y[y.notnull()]
    if y.dtype.kind == 'f' and np.all(y_notnull == y_notnull.astype(int)):
        y.loc[y_notnull.index] = y_notnull.astype(int).astype(str)
    else:
        y.loc[y_notnull.index] = y_notnull.astype(str)
    return y
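

# Usage sketch (added for illustration; not part of the original module, and
# assuming a pandas version that permits assigning strings into a float
# series): a float column whose non-null values are whole numbers is cast
# through int first, so 1.0 becomes '1' rather than '1.0'.
#
#   >>> df = pd.DataFrame({'y': [1.0, np.nan, 2.0]})
#   >>> check_and_convert_target_variable(df, 'y').tolist()
#   ['1', nan, '2']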


def verify_columns_are_categorical(df, fields):
    """Verify that the fields are present in the input dataframe and that
    they are string, boolean or integer typed; raise an error otherwise.

    Args:
        df (dataframe): input dataframe
        fields (list): list of field names to split by

    Returns:
        None if no error is raised.

    Raises:
        - Error when the fields are not present in the df
        - Error when a field's type does not fall into any of the categories
          in allowed_types
    """
    for field in fields:
        assert_field_present(df, field)

    allowed_types = (
        int,
        bool,
        str,
        np.bool_,
        np.object_,
        np.int8,
        np.int16,
        np.int32,
        np.int64,
        np.uint8,
        np.uint16,
        np.uint32,
        np.uint64,
    )
    for field, dtype in df[fields].dtypes.items():
        if dtype not in allowed_types:
            raise RuntimeError(
                'This operation allows only string, boolean, and integer values, while '
                'field {} is none of these types.'.format(field)
            )


def split_by(df, fields, max_groups=None):
    """Split the rows of the input dataframe into groups given a set of
    fields. Optionally check that the number of resulting groups does not
    exceed a limit.

    NOTE: this function verifies that the fields used in the split operation
    are only string, boolean or integer, and raises an error otherwise.

    Args:
        df (dataframe): input dataframe
        fields (list): list of field names to split by
        max_groups (int): maximum allowed number of groups. If the number of
            resulting groups is larger than this number, an error is raised

    Returns:
        pandas.DataFrameGroupBy: the resulting groups

    Raises:
        - Error when the fields are not present in the df
        - Error when a field's type does not fall into any of the categories
          in allowed_types
    """
    verify_columns_are_categorical(df, fields)
    groups = df.groupby(fields, as_index=False)
    if max_groups and len(groups) > max_groups:
        raise RuntimeError(
            'The number of groups cannot exceed {}; the current number of groups is {}. '
            'Please find detailed information about the number of groups in the docs.'.format(
                max_groups, len(groups)
            )
        )
    return groups
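

# Usage sketch (added for illustration; not part of the original module):
# splitting events by a categorical field yields one group per distinct
# value. The data values are made up.
#
#   >>> df = pd.DataFrame({'host': ['a', 'a', 'b'], 'val': [1, 2, 3]})
#   >>> groups = split_by(df, ['host'], max_groups=10)
#   >>> len(groups)
#   2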


def remove_duplicates(fields):
    """Remove duplicate elements from the fields list.

    Args:
        fields (list): list of field names

    Returns:
        list of distinct field names
    """
    return list(OrderedDict.fromkeys(fields))
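

# Usage sketch (added for illustration; not part of the original module):
# OrderedDict.fromkeys de-duplicates while preserving the first occurrence
# of each field, unlike set(), which would lose the ordering.
#
#   >>> remove_duplicates(['a', 'b', 'a', 'c'])
#   ['a', 'b', 'c']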


def is_empty_df(df):
    """Return True if the dataframe has no rows.

    Args:
        df (dataframe): input dataframe

    Returns:
        bool
    """
    return len(df) == 0