You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
716 lines
22 KiB
716 lines
22 KiB
#!/usr/bin/env python
|
|
from collections import OrderedDict
|
|
import pandas as pd
|
|
import numpy as np
|
|
|
|
import cexc
|
|
|
|
logger = cexc.get_logger(__name__)
|
|
|
|
from .constants import HOWTO_CONFIGURE_MLSPL_LIMITS
|
|
|
|
|
|
def drop_unused_and_missing(X, required_fields):
    """Keep only the required columns, then discard incomplete data.

    The helpers run in a fixed order: columns that were not requested are
    removed, a warning is issued for requested fields that are absent,
    columns that are entirely null are removed, and finally every row that
    still contains a null is removed.

    Args:
        X (dataframe): input dataframe
        required_fields (list): names of the fields to keep

    Returns:
        X (dataframe): the dataframe handed back by ``drop_na_rows``
        nans (np array): boolean array, as computed by ``drop_na_rows``,
            marking the rows that contained missing values
    """
    # Column-level clean-up first, so the row mask only reflects kept fields.
    drop_unused_fields(X, required_fields)
    warn_on_missing_fields(X, required_fields)
    drop_na_columns(X)

    cleaned, nans = drop_na_rows(X)
    return cleaned, nans
|
|
|
|
|
|
def get_cols_with_mixed_type(df):
    """Collect the columns whose values are not all of one Python type.

    Only columns with object dtype are inspected; a column counts as mixed
    when applying ``type`` to its values yields more than one distinct type.

    Args:
        df (dataframe): input dataframe

    Returns:
        list of column names, in the order they appear in ``df.columns``
    """
    mixed = []
    for name in df.columns:
        series = df[name]
        if not (series.dtype == 'O'):
            # Non-object columns are never reported.
            continue
        distinct_types = series.apply(type).unique()
        if len(distinct_types) > 1:
            mixed.append(name)
    return mixed
|
|
|
|
|
|
def mixed_type_cols_to_string(mixed_type_cols, df):
    """Stringify every value of the listed columns.

    Each named column is replaced, in the given dataframe, by the result of
    applying ``str`` to its values.

    Args:
        mixed_type_cols (list): names of the columns containing mixed types
        df (dataframe): input dataframe (modified in place)

    Returns:
        the same dataframe object, after conversion
    """
    for column_name in mixed_type_cols:
        as_text = df[column_name].apply(str)
        df[column_name] = as_text
    return df
|
|
|
|
|
|
def prepare_features(X, variables, final_columns=None, get_dummies=True, mlspl_limits=None):
    """Run the conventional feature-preparation pipeline.

    Steps, in order:
        - drop unused columns and rows with missing values
        - if ``get_dummies`` is true: filter high-cardinality object columns,
          stringify mixed-type columns and expand categorical fields into
          indicator columns (named ``field=value``)
        - if ``final_columns`` is given: drop columns outside that list and
          zero-fill the listed columns that are absent
        - verify that rows and columns remain, then sort the columns

    Args:
        X (dataframe): input dataframe
        variables (list): column names
        final_columns (list): finalized column names
        get_dummies (bool): whether categorical variables should be converted
        mlspl_limits (dict): values from the mlspl conf; only
            ``max_distinct_cat_values`` is read here (default 100)

    Returns:
        X (dataframe): prepared feature dataframe
        nans (np array): boolean array to indicate which rows have missing
            values in the original dataframe
        columns (list): sorted list of feature column names
    """
    limits = {} if mlspl_limits is None else mlspl_limits
    max_distinct_cat_values = int(limits.get('max_distinct_cat_values', 100))

    X, nans = drop_unused_and_missing(X, variables)

    if get_dummies:
        filter_non_numeric(X, max_distinct_cat_values)
        X = mixed_type_cols_to_string(get_cols_with_mixed_type(X), X)
        X = pd.get_dummies(X, prefix_sep='=', sparse=False)

    if final_columns is not None:
        drop_unused_fields(X, final_columns)
        assert_any_fields(X)
        fill_missing_fields(X, final_columns)

    assert_any_rows(X)
    assert_any_fields(X)
    columns = sort_fields(X)
    return (X, nans, columns)
|
|
|
|
|
|
def prepare_features_and_target(X, variables, target, mlspl_limits=None):
    """Run the conventional preparation pipeline for features plus target.

    Steps, in order:
        - drop unused columns and rows with missing values
        - split the dataframe into features and target
        - filter high-cardinality object columns, stringify mixed-type
          columns and expand categorical fields into indicator columns
        - verify that columns and rows remain, then sort the columns

    Args:
        X (dataframe): input dataframe
        variables (list): column names
        target (str): column name
        mlspl_limits (dict): values from the mlspl conf; only
            ``max_distinct_cat_values`` is read here (default 100)

    Returns:
        X (dataframe): prepared feature dataframe
        y (pd series): prepared target pandas series
        columns (list): sorted list of feature column names
    """
    limits = mlspl_limits if mlspl_limits is not None else {}
    max_distinct_cat_values = int(limits.get('max_distinct_cat_values', 100))

    # The missing-row mask is not needed here, only the cleaned frame.
    X = drop_unused_and_missing(X, variables)[0]
    X, y = split_features_and_target(X, target)

    filter_non_numeric(X, max_distinct_cat_values)
    X = mixed_type_cols_to_string(get_cols_with_mixed_type(X), X)
    X = pd.get_dummies(X, prefix_sep='=', sparse=False)

    assert_any_fields(X)
    assert_any_rows(X)
    columns = sort_fields(X)
    return (X, y, columns)
|
|
|
|
|
|
def create_output_dataframe(y_hat, nans, output_names, shape=None):
    """Build the output dataframe holding predictions.

    A frame of the requested (or inferred) shape is created, every output
    column is set to NaN, and ``y_hat`` is then written into the rows where
    ``nans`` is False.

    Args:
        y_hat (numpy array or list): predictions
        nans (np array): boolean array to indicate which rows have missing
            values in the original dataframe
        output_names (str or list): column name(s) for the output dataframe
        shape (tuple): shape for the output dataframe; inferred from ``nans``
            and ``output_names`` when omitted

    Returns:
        output (dataframe): output dataframe
    """
    names_given_as_list = type(output_names) is list

    if shape is None:
        if names_given_as_list:
            # Several output names imply a two-dimensional result.
            shape = (len(nans), len(output_names))
        else:
            # MLA-1450: Flatten the result when y_hat.ndim == 2 due to sklearn bug.
            # https://github.com/scikit-learn/scikit-learn/issues/5058
            # y_hat can be dataframe, series or list.
            if type(y_hat) is not list and y_hat.ndim == 2:
                y_hat = y_hat.reshape(-1)
            shape = len(nans)

    header = output_names if names_given_as_list else [output_names]
    output = pd.DataFrame(data=np.zeros(shape), columns=header)
    output[output_names] = np.nan

    # Only rows without missing inputs receive a prediction.
    predicted_rows = output.index[~nans]
    output.loc[predicted_rows, output_names] = y_hat
    return output
|
|
|
|
|
|
def merge_predictions(original_df, additional_df):
    """Merge two equally sized dataframes side by side, row by row.

    The rows are paired positionally: ``additional_df`` is given the index of
    ``original_df`` before the column-wise join. Columns present in both
    frames are taken from ``additional_df``.

    Note: both inputs are modified in place (``additional_df`` gets a new
    index, ``original_df`` loses the duplicated columns).

    Args:
        original_df (dataframe): first dataframe
        additional_df (dataframe): second dataframe

    Returns:
        merged_df (dataframe): merged dataframe

    Raises:
        AssertionError: if the two dataframes do not have the same row count
    """
    assert (
        original_df.index.size == additional_df.index.size
    ), 'Cannot merge dataframes with different numbers of rows.'
    # Always align on the original index. Comparing only the first label
    # (as was done before) fails on empty frames and misses indexes that
    # start identically but diverge later, which silently dropped rows in
    # the inner join below.
    additional_df.index = original_df.index
    dupes = additional_df.columns.intersection(original_df.columns)
    original_df.drop(columns=dupes, inplace=True)
    return pd.concat([original_df, additional_df], axis=1, join='inner')
|
|
|
|
|
|
def get_unseen_value_behavior(options):
    """Determine how new values in categorical fields should be handled.

    The value comes from ``options['mlspl_limits']['handle_new_cat']``
    (falling back to ``'default'``). A truthy
    ``options['params']['unseen_value']`` overrides it; in that case the key
    is also removed from ``options['params']``.

    Args:
        options (dict): options

    Returns:
        handle_new_cat (str): the choice to handle new values
    """
    limits = options.get('mlspl_limits', {})
    behavior = limits.get('handle_new_cat', 'default')

    if 'params' not in options:
        return behavior

    params = options['params']
    override = params.get('unseen_value', [])
    if override:
        behavior = override
        # The parameter is consumed here so it is not seen again downstream.
        del params['unseen_value']
    return behavior
|
|
|
|
|
|
def handle_new_categorical_values(X, y, options, columns, classes=None):
    """Handle new/unseen categorical values.

    Categorical variables are usually converted to indicator dummy variables.
    Models with incremental fit capability will save the total number of
    features produced. This method defines what to do when a new indicator
    variable is created from a previously unseen categorical value.

    The behavior is chosen via ``get_unseen_value_behavior(options)``:
        - 'skip': rows with unseen feature values are removed (together with
          the new indicator columns); rows with unseen target values are
          removed when ``classes`` is given
        - 'default': the new indicator columns are dropped with a warning;
          rows with unseen target values are removed when ``classes`` is given
        - 'stop': a RuntimeError is raised on any unseen feature column or,
          when ``classes`` is given, any unseen target value

    Args:
        X (dataframe): feature dataframe
        y (pd series): target series
        options (dict): options
        columns (list): column names
        classes (np array): unique class labels in the target

    Returns:
        X (dataframe): feature dataframe
        y (pd series): target series

    Raises:
        Exception: if the configured behavior is not 'stop', 'default' or 'skip'
        RuntimeError: in 'stop' mode, when unseen values are encountered
    """
    handle_new_cat = get_unseen_value_behavior(options)
    action_unseen = {'stop', 'default', 'skip'}
    # Explicit check instead of `assert`: assertions are stripped under
    # `python -O`, which would let invalid values fall through to 'stop'.
    if handle_new_cat not in action_unseen:
        raise Exception('Invalid value for "unseen_value": %s' % handle_new_cat)

    # Fill in empty columns if input has fewer categorical values than the ones
    # the existing model was trained with
    fill_missing_fields(X, columns)

    if handle_new_cat == 'skip':
        # remove rows containing new categorical values in X
        new_cat_ind_X, new_cat_cols = get_indicies_of_unseen_categorical_values(X, columns)
        if len(new_cat_ind_X) > 0:
            X, y = skip_unseen_categorical_values(X, y, new_cat_ind_X, new_cat_cols)
        # remove rows containing new categorical values in y
        if classes is not None:
            X, y = skip_unseen_target_values(X, y, classes)

    elif handle_new_cat == 'default':
        # set columns that corresponds to new categorical value(s) to 0 for applicable rows
        new_cat_ind_X, new_cat_cols = get_indicies_of_unseen_categorical_values(X, columns)
        if len(new_cat_ind_X) > 0:
            X.drop(new_cat_cols, axis=1, inplace=True)
            cexc.messages.warn(
                'Columns correspond to unseen categorical explanatory variable value(s): %s are omitted'
                % new_cat_cols
            )
        # remove rows containing new categorical values in y
        if classes is not None:
            X, y = skip_unseen_target_values(X, y, classes)

    else:
        # stops when encountering rows containing new categorical values (X or y)
        new_col_in_X = np.setdiff1d(X.columns, columns)
        if len(new_col_in_X) > 0:
            raise RuntimeError(
                'New categorical value for explanatory variables in training data: %s'
                % new_col_in_X
            )
        if classes is not None:
            new_class_in_y = np.setdiff1d(y, classes)
            if len(new_class_in_y) > 0:
                raise RuntimeError('New target values in training data: %s' % new_class_in_y)

    return X, y
|
|
|
|
|
|
def filter_non_numeric(df, max_values=100):
    """Drop object-dtype columns that have too many distinct values.

    Object columns with more than ``max_values`` distinct values are removed
    in place (with a user-visible warning). The remaining object columns are
    reported as being treated as categorical: the user-visible message lists
    them when there are at most two, otherwise it only gives their count,
    while the full list always goes to the module logger.

    Args:
        df (dataframe): input dataframe (modified in place)
        max_values (int): maximum number of values to allow

    Returns:
        df (dataframe): output dataframe

    Raises:
        RuntimeError: if no columns are left
    """
    object_cols = list(df.dtypes[df.dtypes == 'object'].index)

    too_many, to_convert = [], []
    # TODO: Profile this loop.
    for name in object_cols:
        bucket = too_many if df[name].nunique() > max_values else to_convert
        bucket.append(name)

    if too_many:
        cexc.messages.warn(
            'Dropping field(s) with too many distinct values: {}. {}'.format(
                ', '.join(too_many), HOWTO_CONFIGURE_MLSPL_LIMITS
            )
        )
        df.drop(too_many, inplace=True, axis=1)

    num_converted_cols = len(to_convert)
    if num_converted_cols > 0:
        log_msg = 'Converting field(s) with categorical values into categorical fields: {}'.format(
            ', '.join(to_convert)
        )
        if num_converted_cols <= 2:
            msg = log_msg
        else:
            # Keep the user-visible message short; details go to the log.
            msg = 'Converting {} field(s) with categorical values into categorical fields. Please see mlspl.log for details.'.format(
                num_converted_cols
            )
        cexc.messages.warn(msg)
        logger.warn(log_msg)

    if len(df.columns) == 0:
        raise RuntimeError('No valid fields to fit or apply model to.')

    return df
|
|
|
|
|
|
def assert_field_present(df, field):
    """Fail unless ``field`` is contained in ``df``.

    Args:
        df (dataframe): input dataframe
        field (str): column name

    Raises:
        Exception: if ``field not in df``
    """
    if field in df:
        return
    raise Exception('Field "%s" not present.' % field)
|
|
|
|
|
|
def limit_classes_for_classifier(df, field, max_values=100):
    """Enforce an upper bound on the number of distinct target values.

    After the cardinality check, every null entry of the field is rewritten
    as ``np.nan`` in place.

    Args:
        df (dataframe): input dataframe (modified in place)
        field (str): column name
        max_values (int): the upper limit for the cardinality of the target

    Returns:
        df (dataframe): output dataframe

    Raises:
        Exception: if the field is absent or has more than ``max_values``
            distinct values
    """
    assert_field_present(df, field)

    distinct_count = df[field].nunique()
    if distinct_count > max_values:
        message = 'Field "{}" has too many distinct values: {} (max {}). {}'.format(
            field, distinct_count, max_values, HOWTO_CONFIGURE_MLSPL_LIMITS
        )
        raise Exception(message)

    # Normalize whatever isnull() flags to np.nan.
    missing_mask = df[field].isnull()
    df.loc[missing_mask, field] = np.nan
    return df
|
|
|
|
|
|
def drop_unused_fields(df, requested_fields):
    """Remove, in place, every column that is not in ``requested_fields``.

    Args:
        df (dataframe): input dataframe (modified in place)
        requested_fields (list): column names to keep

    Returns:
        df (dataframe): the same dataframe object
    """
    present = set(df.columns)
    unrequested = present.difference(requested_fields)
    df.drop(unrequested, axis=1, inplace=True)
    return df
|
|
|
|
|
|
def warn_on_missing_fields(df, requested_fields):
    """Emit a user-visible warning naming requested fields absent from ``df``.

    Nothing happens when every requested field is present.

    Args:
        df (dataframe): input dataframe
        requested_fields (list): column names
    """
    absent = set(requested_fields).difference(df.columns)
    if len(absent) == 0:
        return
    cexc.messages.warn('Missing field(s): %s', ', '.join(absent))
|
|
|
|
|
|
def split_features_and_target(df, target_variable):
    """Separate the target column from the feature columns.

    The target column is popped out of ``df``, so the returned feature frame
    is the very same object as the input, minus that column.

    Args:
        df (dataframe): input dataframe (modified in place)
        target_variable (str): column name

    Returns:
        features (dataframe): feature dataframe
        target (pandas series): target series

    Raises:
        Exception: if the target column is absent
        RuntimeError: if no feature column remains after the split
    """
    assert_field_present(df, target_variable)
    target = df.pop(target_variable)
    # After pop(), df holds only the features.
    assert_any_fields(df)
    return df, target
|
|
|
|
|
|
def drop_na_columns(df):
    """Remove, in place, the columns in which every value is missing/null.

    A user-visible warning lists the removed columns, if any.

    Args:
        df (dataframe): input dataframe (modified in place)

    Returns:
        df (dataframe): output dataframe, with na columns dropped
    """
    columns_before = set(df.columns)
    df.dropna(axis=1, how='all', inplace=True)
    removed = columns_before.difference(df.columns)
    if len(removed) > 0:
        cexc.messages.warn('Dropped field(s) with all null values: %s', ', '.join(removed))
    return df
|
|
|
|
|
|
def drop_na_rows(df):
    """Remove, in place, every row that contains a missing value.

    The mask is computed before the rows are dropped, so it has one entry
    per row of the dataframe as it was passed in.

    Args:
        df (dataframe): input dataframe (modified in place)

    Returns:
        df (dataframe): output dataframe, with na rows dropped
        nans (np array): boolean array to indicate which rows have missing
            values in the original dataframe
    """
    row_has_null = df.isnull().any(axis=1)
    nans = row_has_null.values
    df.dropna(how='any', axis=0, inplace=True)
    return df, nans
|
|
|
|
|
|
def assert_any_fields(df):
    """Fail when the dataframe has no columns.

    Args:
        df (dataframe): input dataframe

    Raises:
        RuntimeError: if ``df`` has zero columns
    """
    column_count = len(df.columns)
    if column_count != 0:
        return
    raise RuntimeError('No valid fields to fit or apply model to.')
|
|
|
|
|
|
def assert_any_rows(df):
    """Fail when the dataframe has no rows.

    Args:
        df (dataframe): input dataframe

    Raises:
        RuntimeError: if ``len(df)`` is zero
    """
    row_count = len(df)
    if row_count != 0:
        return
    raise RuntimeError(
        'No valid events; check for null or non-numeric values in numeric fields'
    )
|
|
|
|
|
|
def sort_fields(df):
    """Sort the dataframe's columns by name, in place, and list them.

    Args:
        df (dataframe): input dataframe (its column order is changed in place)

    Returns:
        (list): the column names in their new, sorted order
    """
    df.sort_index(axis=1, inplace=True)
    ordered_names = [name for name in df.columns]
    return ordered_names
|
|
|
|
|
|
def fill_missing_fields(df, requested_fields):
    """Add every requested field that is absent from ``df`` as a 0 column.

    The added field names are written to the debug log.

    Args:
        df (dataframe): input dataframe (modified in place)
        requested_fields (list): column names

    Returns:
        df (dataframe): the same dataframe object
    """
    absent = set(requested_fields).difference(set(df.columns))
    if len(absent) == 0:
        return df

    cexc.logger.debug('Filling missing fields(s): %s', ', '.join(absent))
    for name in absent:
        df[name] = 0
    return df
|
|
|
|
|
|
def apply_in_chunks(df, func, n=1000, options=None):
    """Make prediction chunk by chunk.

    ``df`` is cut into consecutive slices of at most ``n`` rows. Each slice
    gets a fresh 0-based index and is passed to ``func(chunk, options)``; the
    returned dataframes are concatenated in order and re-indexed from 0.

    Args:
        df (dataframe): input dataframe
        func (callable): function that defines apply behavior; called as
            ``func(chunk, options)`` and expected to return a dataframe
        n (int): number of rows per chunk
        options (dict): options

    Returns:
        df (dataframe): output dataframe; an empty dataframe when the input
            has no rows (``func`` is then never called)
    """
    results = [
        func(df[start : start + n].reset_index(drop=True), options)
        for start in range(0, len(df), n)
    ]

    # pd.concat raises on an empty list, so keep the historical result for
    # empty input explicit.
    if not results:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; concat is the replacement
    # and returns a new, freshly indexed frame.
    return pd.concat(results, ignore_index=True)
|
|
|
|
|
|
def get_indicies_of_unseen_categorical_values(X, columns):
    """Find the row positions where new (unseen) columns hold a truthy value.

    New columns are the columns of ``X`` that are not listed in ``columns``.

    Args:
        X (dataframe): feature dataframe
        columns (list): column names

    Returns:
        new_cat_idx (np array): 0-based row positions (not index labels)
        new_categorical_columns (np array): new categorical columns

        When there are no new columns, the pair is instead
        ``(empty array of column names, None)``.
    """
    unseen_columns = np.setdiff1d(X.columns, columns)
    if len(unseen_columns) > 0:
        row_has_unseen = X[unseen_columns].any(axis=1).values
        row_positions = np.where(row_has_unseen)[0]
        return row_positions, unseen_columns

    # NOTE: the empty columns array occupies the first slot on this path.
    return (unseen_columns, None)
|
|
|
|
|
|
def skip_unseen_categorical_values(X, y, row_idx, new_categorical_columns):
    """Remove rows with unseen categorical value(s) from X.

    The given rows are dropped from ``X`` (and from ``y`` when it is not
    None), the new indicator columns are dropped from ``X``, and a
    user-visible warning is issued. Both ``X`` and ``y`` are modified in
    place.

    Args:
        X (dataframe): feature dataframe
        y (pd series): target series
        row_idx (np array): 0-based row positions to remove
        new_categorical_columns (np array): new categorical columns

    Returns:
        X (dataframe): feature dataframe
        y (pd series): target series
    """
    # `drop` removes by index label, while row_idx holds positions. Translate
    # positions to labels first so the right rows go even when the index has
    # gaps (e.g. after rows with missing values were dropped). With a default
    # 0..n-1 index this is identical to dropping row_idx directly.
    X.drop(X.index[row_idx], axis=0, inplace=True)
    X.drop(new_categorical_columns, axis=1, inplace=True)
    if y is not None:
        y.drop(y.index[row_idx], axis=0, inplace=True)
    cexc.messages.warn(
        "Some events containing unseen categorical feature values have been skipped while updating the model."
    )
    return X, y
|
|
|
|
|
|
def skip_unseen_target_values(X, y, classes):
    """Remove the rows whose target value is not one of the known classes.

    When every value of ``y`` is already in ``classes`` the inputs are
    returned untouched. Otherwise both ``X`` and ``y`` are filtered with the
    same mask and a user-visible warning is issued.

    Args:
        X (dataframe): feature dataframe
        y (pd series): target series
        classes (np array): unique classes

    Returns:
        X (dataframe): feature dataframe
        y (pd series): target series
    """
    unseen_targets = np.setdiff1d(np.unique(y), classes)
    if len(unseen_targets) > 0:
        # Only keep rows where y is a previously seen class
        keep = y.isin(classes)
        X, y = X[keep], y[keep]
        cexc.messages.warn(
            'Some events containing unseen categorical target values have been skipped while updating the model.'
        )
    return X, y
|
|
|
|
|
|
def check_and_convert_target_variable(df, field):
    """Check target variable is present, then convert non-null values to str.

    If the column has a float dtype but all of its non-null values are whole
    numbers, they are cast to int before the str conversion, since pandas
    promotes int series to float when nans are present. Null entries are
    left as they are.

    Args:
        df (dataframe): input dataframe
        field (str): target field name

    Returns:
        y (pd series): target series

    Raises:
        Exception: if the field is absent
    """
    assert_field_present(df, field)
    y = df[field]
    present = y[y.notnull()]

    whole_number_floats = y.dtype.kind == 'f' and np.all(present == present.astype(int))
    if whole_number_floats:
        # Avoid "1.0"-style labels for values that are really integers.
        as_text = present.astype(int).astype(str)
    else:
        as_text = present.astype(str)

    y.loc[present.index] = as_text
    return y
|
|
|
|
|
|
def verify_columns_are_categorical(df, fields):
    """Check that the fields exist and have a string, boolean or integer dtype.

    Each field's dtype is compared against a fixed list of allowed types;
    the first field whose dtype matches none of them raises an error.

    Args:
        df: (dataframe): input dataframe
        fields (list): list of field names to split by

    Returns:
        None if no error is raised.

    Raises:
        - Error when the fields are not present in the df
        - Error when elements in dataset does not fall in any of the categories in allowed_types
    """
    for name in fields:
        assert_field_present(df, name)

    builtin_and_generic = (int, bool, str, np.bool_, np.object_)
    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    allowed_types = builtin_and_generic + signed_ints + unsigned_ints

    for field, dtype in df[fields].dtypes.items():
        if dtype in allowed_types:
            continue
        raise RuntimeError(
            'This operation allows only string, boolean, and integer values while '
            'field {} is none of these types.'.format(field)
        )
|
|
|
|
|
|
def split_by(df, fields, max_groups=None):
    """Group the rows of the dataframe by the given fields.

    The fields are first verified to be string, boolean or integer typed
    (an error is raised otherwise). Optionally, the number of resulting
    groups is checked against an upper bound.

    Args:
        df: (dataframe): input dataframe
        fields (list): list of field names to split by
        max_groups (int): maximum allowed groups. If the number of resulted
            groups is larger than this number, an error is raised. A falsy
            value disables the check.

    Returns:
        pandas.DataFrameGroupBy: The resulting groups

    Raises:
        - Error when the fields are not present in the df
        - Error when elements in dataset does not fall in any of the categories in allowed_types
        - RuntimeError when there are more than ``max_groups`` groups
    """
    verify_columns_are_categorical(df, fields)
    groups = df.groupby(fields, as_index=False)

    if not max_groups:
        return groups

    group_count = len(groups)
    if group_count > max_groups:
        raise RuntimeError(
            'The number of groups can not exceed {}; the current number of groups is {}. '
            'Please find detailed information about the number of groups in docs.'.format(
                max_groups, group_count
            )
        )
    return groups
|
|
|
|
|
|
def remove_duplicates(fields):
    """Return the distinct elements of ``fields``, keeping first occurrences.

    The relative order of the surviving elements is unchanged.

    Args:
        fields (list): list of field names

    Returns:
        list of distinct field names
    """
    seen = set()
    distinct = []
    for name in fields:
        if name in seen:
            continue
        seen.add(name)
        distinct.append(name)
    return distinct
|
|
|
|
|
|
def is_empty_df(df):
    """Tell whether ``df`` has zero rows (``len(df) == 0``)."""
    return not len(df)
|