#!/usr/bin/env python
from collections import OrderedDict

import pandas as pd
import numpy as np

import cexc

from .constants import HOWTO_CONFIGURE_MLSPL_LIMITS

logger = cexc.get_logger(__name__)


def drop_unused_and_missing(X, required_fields):
    """Drop columns that are not *required*, drop rows that have missing values.

    Args:
        X (dataframe): input dataframe
        required_fields (list): required fields

    Returns:
        X (dataframe): output dataframe, with some columns and rows dropped
        nans (np array): boolean array to indicate which rows have missing
            values in the original dataframe
    """
    drop_unused_fields(X, required_fields)
    warn_on_missing_fields(X, required_fields)
    drop_na_columns(X)
    X, nans = drop_na_rows(X)
    return X, nans


def get_cols_with_mixed_type(df):
    """Get the list of columns in a DataFrame that do not have uniform types.

    Args:
        df (dataframe): input dataframe

    Returns:
        list of column names
    """

    def is_mixed_type(df_col):
        return df_col.dtype == 'O' and len(df_col.apply(type).unique()) > 1

    return [col for col in df.columns if is_mixed_type(df[col])]


def mixed_type_cols_to_string(mixed_type_cols, df):
    """Convert columns with non-uniform types to strings.

    Args:
        mixed_type_cols (list): list of names of columns containing mixed types
        df (dataframe): input dataframe

    Returns:
        modified dataframe
    """
    for col in mixed_type_cols:
        df[col] = df[col].apply(str)
    return df
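

# Usage sketch (added for illustration; not part of the original module):
# a column holding both ints and strings is object-typed with more than one
# Python type, so it is flagged by get_cols_with_mixed_type and stringified
# by mixed_type_cols_to_string. The data values are made up.
#
#   >>> df = pd.DataFrame({'a': [1, 'x'], 'b': [1.0, 2.0]})
#   >>> get_cols_with_mixed_type(df)
#   ['a']
#   >>> mixed_type_cols_to_string(['a'], df)['a'].tolist()
#   ['1', 'x']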


def prepare_features(X, variables, final_columns=None, get_dummies=True, mlspl_limits=None):
    """Prepare features.

    This method defines the conventional steps to prepare features:
        - drop unused columns
        - drop rows that have missing values
        - optionally (if get_dummies==True)
            - convert categorical fields into indicator dummy variables
        - optionally (if final_columns is provided)
            - make the resulting dataframe match final_columns

    Args:
        X (dataframe): input dataframe
        variables (list): column names
        final_columns (list): finalized column names
        get_dummies (bool): whether categorical variables should be converted
            into indicator dummy variables
        mlspl_limits (dict): a dictionary containing values from mlspl.conf

    Returns:
        X (dataframe): prepared feature dataframe
        nans (np array): boolean array to indicate which rows have missing
            values in the original dataframe
        columns (list): sorted list of feature column names
    """
    if mlspl_limits is None:
        mlspl_limits = {}
    max_distinct_cat_values = int(mlspl_limits.get('max_distinct_cat_values', 100))

    X, nans = drop_unused_and_missing(X, variables)

    if get_dummies:
        filter_non_numeric(X, max_distinct_cat_values)
        mixed_type_cols = get_cols_with_mixed_type(X)
        X = mixed_type_cols_to_string(mixed_type_cols, X)
        X = pd.get_dummies(X, prefix_sep='=', sparse=False)

    if final_columns is not None:
        drop_unused_fields(X, final_columns)
        assert_any_fields(X)
        fill_missing_fields(X, final_columns)

    assert_any_rows(X)
    assert_any_fields(X)
    columns = sort_fields(X)
    return (X, nans, columns)
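

# Usage sketch (added for illustration; not part of the original module, and
# user-visible warnings assume the cexc runtime is available): a categorical
# column is dummy-encoded with '=' as the separator, and rows with missing
# values are reported via `nans`. The data values are made up.
#
#   >>> X = pd.DataFrame({'size': [1.0, 2.0, np.nan], 'color': ['r', 'b', 'r']})
#   >>> X, nans, columns = prepare_features(X, ['size', 'color'])
#   >>> columns
#   ['color=b', 'color=r', 'size']
#   >>> nans.tolist()
#   [False, False, True]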


def prepare_features_and_target(X, variables, target, mlspl_limits=None):
    """Prepare features and target.

    This method defines the conventional steps to prepare features and target:
        - drop unused columns
        - drop rows that have missing values
        - split the dataframe into features and target
        - convert categorical variables into indicator dummy variables

    Args:
        X (dataframe): input dataframe
        variables (list): column names
        target (str): column name
        mlspl_limits (dict): a dictionary containing values from mlspl.conf

    Returns:
        X (dataframe): prepared feature dataframe
        y (pd series): prepared target pandas series
        columns (list): sorted list of feature column names
    """
    if mlspl_limits is None:
        mlspl_limits = {}
    max_distinct_cat_values = int(mlspl_limits.get('max_distinct_cat_values', 100))

    X, _ = drop_unused_and_missing(X, variables)
    X, y = split_features_and_target(X, target)

    filter_non_numeric(X, max_distinct_cat_values)
    mixed_type_cols = get_cols_with_mixed_type(X)
    X = mixed_type_cols_to_string(mixed_type_cols, X)
    X = pd.get_dummies(X, prefix_sep='=', sparse=False)

    assert_any_fields(X)
    assert_any_rows(X)
    columns = sort_fields(X)
    return (X, y, columns)


def create_output_dataframe(y_hat, nans, output_names, shape=None):
    """Create the output dataframe.

    This method defines the steps to create the output dataframe:
        - initialize an empty dataframe according to
            - the given list of column names
            - the given or inferred shape
        - populate the dataframe with y_hat, filling in nan where there are
          no predictions

    Args:
        y_hat (numpy array or list): predictions
        nans (np array): boolean array to indicate which rows have missing
            values in the original dataframe
        output_names (str or list): column names for the output dataframe
        shape (tuple): shape of the output dataframe

    Returns:
        output (dataframe): output dataframe
    """
    if shape is None:
        # If we pass multiple output_names in a list,
        # the width is necessarily > 1
        if type(output_names) is list:
            shape = (len(nans), len(output_names))
        else:
            # MLA-1450: Flatten the result when y_hat.ndim == 2 due to sklearn bug.
            # https://github.com/scikit-learn/scikit-learn/issues/5058
            # y_hat can be a dataframe, series or list.
            if type(y_hat) is not list and y_hat.ndim == 2:
                y_hat = y_hat.reshape(-1)
            shape = len(nans)

    columns = output_names if type(output_names) is list else [output_names]
    output = pd.DataFrame(columns=columns, data=np.zeros(shape))
    output[output_names] = np.nan
    output.loc[output.index[~nans], output_names] = y_hat
    return output
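

# Usage sketch (added for illustration; not part of the original module):
# rows flagged in `nans` receive NaN instead of a prediction, so the output
# lines up row-for-row with the original events. The data values are made up.
#
#   >>> nans = np.array([False, True, False])
#   >>> out = create_output_dataframe([1, 2], nans, 'predicted')
#   >>> out['predicted'].tolist()
#   [1.0, nan, 2.0]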


def merge_predictions(original_df, additional_df):
    """Merge two dataframes.

    Args:
        original_df (dataframe): first dataframe
        additional_df (dataframe): second dataframe

    Returns:
        merged_df (dataframe): merged dataframe
    """
    assert original_df.index.size == additional_df.index.size
    # We need to make sure the indexes of both DataFrames are aligned
    # so the join is performed correctly
    if additional_df.index[0] != original_df.index[0]:
        additional_df.index = original_df.index
    dupes = additional_df.columns.intersection(original_df.columns)
    original_df.drop(columns=dupes, inplace=True)
    return pd.concat([original_df, additional_df], axis=1, join='inner')
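

# Usage sketch (added for illustration; not part of the original module):
# predictions are joined back onto the source events, and any overlapping
# column in the original frame is dropped (in place) in favor of the new one.
# The data values are made up.
#
#   >>> orig = pd.DataFrame({'x': [1, 2], 'predicted': [0, 0]})
#   >>> preds = pd.DataFrame({'predicted': [9, 8]})
#   >>> merge_predictions(orig, preds)['predicted'].tolist()
#   [9, 8]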


def get_unseen_value_behavior(options):
    """Load the option for handling new values in categorical fields.

    Args:
        options (dict): options

    Returns:
        handle_new_cat (str): the choice of how to handle new values
    """
    mlspl_limits = options.get('mlspl_limits', {})
    handle_new_cat = mlspl_limits.get('handle_new_cat', 'default')
    if 'params' in options:
        if options['params'].get('unseen_value', []):
            handle_new_cat = options['params']['unseen_value']
            del options['params']['unseen_value']
    return handle_new_cat


def handle_new_categorical_values(X, y, options, columns, classes=None):
    """Handle new/unseen categorical values.

    Categorical variables are usually converted to indicator dummy variables.
    Models with incremental fit capability save the total number of features
    produced. This method defines what to do when a new indicator variable is
    created from a previously unseen categorical value.

    Args:
        X (dataframe): feature dataframe
        y (pd series): target series
        options (dict): options
        columns (list): column names
        classes (np array): unique class labels in the target

    Returns:
        X (dataframe): feature dataframe
        y (pd series): target series
    """
    handle_new_cat = get_unseen_value_behavior(options)
    action_unseen = {'stop', 'default', 'skip'}
    if handle_new_cat not in action_unseen:
        raise Exception('Invalid value for "unseen_value": %s' % handle_new_cat)

    # Fill in empty columns if the input has fewer categorical values than
    # the ones the existing model was trained with
    fill_missing_fields(X, columns)

    if handle_new_cat == 'skip':
        # Remove rows containing new categorical values from X
        new_cat_ind_X, new_cat_cols = get_indicies_of_unseen_categorical_values(X, columns)
        if len(new_cat_ind_X) > 0:
            X, y = skip_unseen_categorical_values(X, y, new_cat_ind_X, new_cat_cols)
        # Remove rows containing new categorical values from y
        if classes is not None:
            X, y = skip_unseen_target_values(X, y, classes)
    elif handle_new_cat == 'default':
        # Drop the columns that correspond to new categorical value(s)
        new_cat_ind_X, new_cat_cols = get_indicies_of_unseen_categorical_values(X, columns)
        if len(new_cat_ind_X) > 0:
            X.drop(new_cat_cols, axis=1, inplace=True)
            cexc.messages.warn(
                'Columns corresponding to unseen categorical explanatory variable value(s) are omitted: %s'
                % new_cat_cols
            )
        # Remove rows containing new categorical values from y
        if classes is not None:
            X, y = skip_unseen_target_values(X, y, classes)
    else:
        # Stop when encountering rows containing new categorical values (X or y)
        new_col_in_X = np.setdiff1d(X.columns, columns)
        if len(new_col_in_X) > 0:
            raise RuntimeError(
                'New categorical value for explanatory variables in training data: %s'
                % new_col_in_X
            )
        if classes is not None:
            new_class_in_y = np.setdiff1d(y, classes)
            if len(new_class_in_y) > 0:
                raise RuntimeError('New target values in training data: %s' % new_class_in_y)

    return X, y


def filter_non_numeric(df, max_values=100):
    """Filter out non-numeric columns that have too many unique values.

    Args:
        df (dataframe): input dataframe
        max_values (int): maximum number of distinct values to allow

    Returns:
        df (dataframe): output dataframe
    """
    drop_cols = []
    convert_cols = []
    scols = list(df.dtypes[df.dtypes == 'object'].index)

    # TODO: Profile this loop.
    for scol in scols:
        if df[scol].nunique() > max_values:
            drop_cols.append(scol)
        else:
            convert_cols.append(scol)

    if len(drop_cols) > 0:
        cexc.messages.warn(
            'Dropping field(s) with too many distinct values: {}. {}'.format(
                ', '.join(drop_cols), HOWTO_CONFIGURE_MLSPL_LIMITS
            )
        )
        df.drop(drop_cols, inplace=True, axis=1)

    num_converted_cols = len(convert_cols)
    log_msg = 'Converting field(s) with categorical values into categorical fields: {}'.format(
        ', '.join(convert_cols)
    )
    if num_converted_cols > 0:
        msg = (
            log_msg
            if num_converted_cols <= 2
            else 'Converting {} field(s) with categorical values into categorical fields. Please see mlspl.log for details.'.format(
                num_converted_cols
            )
        )
        cexc.messages.warn(msg)
        logger.warning(log_msg)

    if len(df.columns) == 0:
        raise RuntimeError('No valid fields to fit or apply model to.')
    return df
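

# Usage sketch (added for illustration; not part of the original module, and
# the warnings assume the cexc runtime is available): string columns whose
# cardinality exceeds max_values are dropped rather than dummy-encoded.
# The data values are made up.
#
#   >>> df = pd.DataFrame({'id': ['a', 'b', 'c'], 'cat': ['x', 'x', 'y']})
#   >>> filter_non_numeric(df, max_values=2).columns.tolist()
#   ['cat']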


def assert_field_present(df, field):
    """Make sure the field is present.

    Args:
        df (dataframe): input dataframe
        field (str): column name

    Raises:
        Exception
    """
    if field not in df:
        raise Exception('Field "%s" not present.' % field)


def limit_classes_for_classifier(df, field, max_values=100):
    """Limit the number of categories for classifiers.

    Args:
        df (dataframe): input dataframe
        field (str): column name
        max_values (int): the upper limit for the cardinality of the target

    Returns:
        df (dataframe): output dataframe
    """
    assert_field_present(df, field)
    n = df[field].nunique()
    if n > max_values:
        raise Exception(
            'Field "{}" has too many distinct values: {} (max {}). {}'.format(
                field, n, max_values, HOWTO_CONFIGURE_MLSPL_LIMITS
            )
        )
    nans = df[field].isnull()
    df.loc[nans, field] = np.nan
    return df


def drop_unused_fields(df, requested_fields):
    """Drop fields the user didn't ask for.

    Args:
        df (dataframe): input dataframe
        requested_fields (list): column names

    Returns:
        df (dataframe): output dataframe
    """
    drop_cols = set(df.columns).difference(requested_fields)
    df.drop(drop_cols, inplace=True, axis=1)
    return df


def warn_on_missing_fields(df, requested_fields):
    """Raise a user-visible warning for missing fields.

    Args:
        df (dataframe): input dataframe
        requested_fields (list): column names
    """
    missing_columns = set(requested_fields).difference(df.columns)
    if len(missing_columns) > 0:
        cexc.messages.warn('Missing field(s): %s', ', '.join(missing_columns))


def split_features_and_target(df, target_variable):
    """Split the input dataframe into features and target.

    Args:
        df (dataframe): input dataframe
        target_variable (str): column name

    Returns:
        features (dataframe): feature dataframe
        target (pandas series): target series
    """
    assert_field_present(df, target_variable)
    target = df.pop(target_variable)
    features = df
    assert_any_fields(features)
    return features, target


def drop_na_columns(df):
    """Drop columns where all values are missing/null.

    Args:
        df (dataframe): input dataframe

    Returns:
        df (dataframe): output dataframe, with na columns dropped
    """
    start_columns = df.columns
    df.dropna(axis=1, how='all', inplace=True)
    end_columns = df.columns
    drop_cols = set(start_columns).difference(end_columns)
    if len(drop_cols) > 0:
        cexc.messages.warn('Dropped field(s) with all null values: %s', ', '.join(drop_cols))
    return df


def drop_na_rows(df):
    """Drop rows that have missing values.

    Args:
        df (dataframe): input dataframe

    Returns:
        df (dataframe): output dataframe, with na rows dropped
        nans (np array): boolean array to indicate which rows have missing
            values in the original dataframe
    """
    nans = df.isnull().any(axis=1).values
    df.dropna(axis=0, how='any', inplace=True)
    return df, nans
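

# Usage sketch (added for illustration; not part of the original module):
# drop_na_rows reports which original rows were incomplete via the boolean
# `nans` array, which callers use to re-align predictions later.
#
#   >>> df = pd.DataFrame({'a': [1.0, np.nan, 3.0]})
#   >>> df, nans = drop_na_rows(df)
#   >>> nans.tolist()
#   [False, True, False]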


def assert_any_fields(df):
    """Make sure there are valid field(s).

    Args:
        df (dataframe): input dataframe

    Raises:
        RuntimeError
    """
    if len(df.columns) == 0:
        raise RuntimeError('No valid fields to fit or apply model to.')


def assert_any_rows(df):
    """Make sure there are valid row(s).

    Args:
        df (dataframe): input dataframe

    Raises:
        RuntimeError
    """
    if len(df) == 0:
        raise RuntimeError(
            'No valid events; check for null or non-numeric values in numeric fields'
        )


def sort_fields(df):
    """Sort the dataframe's columns by name, in place.

    Args:
        df (dataframe): input dataframe

    Returns:
        (list): sorted list of column names
    """
    df.sort_index(inplace=True, axis=1)
    return list(df.columns)


def fill_missing_fields(df, requested_fields):
    """Fill missing fields with 0's.

    Args:
        df (dataframe): input dataframe
        requested_fields (list): column names

    Returns:
        df (dataframe): output dataframe
    """
    missing_fields = set(requested_fields).difference(set(df.columns))
    if len(missing_fields) > 0:
        cexc.logger.debug('Filling missing field(s): %s', ', '.join(missing_fields))
        for col in missing_fields:
            df[col] = 0
    return df
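

# Usage sketch (added for illustration; not part of the original module, and
# the debug logging assumes the cexc helpers are available): dummy columns
# the model saw at fit time but that are absent at apply time are re-created
# as zeros. The column names are made up.
#
#   >>> df = pd.DataFrame({'color=r': [1, 0]})
#   >>> fill_missing_fields(df, ['color=r', 'color=b'])['color=b'].tolist()
#   [0, 0]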


def apply_in_chunks(df, func, n=1000, options=None):
    """Make predictions chunk by chunk.

    Args:
        df (dataframe): input dataframe
        func (callable): function that defines the apply behavior
        n (int): number of rows per chunk
        options (dict): options

    Returns:
        df (dataframe): output dataframe
    """

    def bechunk(df_, n_):
        return [df_[i : i + n_] for i in range(0, len(df_), n_)]

    dfs = [func(x.reset_index(drop=True), options) for x in bechunk(df, n)]
    # DataFrame.append() was removed in pandas 2.0; pd.concat with
    # ignore_index=True is equivalent to appending the chunks and then
    # resetting the index.
    df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
    return df
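

# Usage sketch (added for illustration; not part of the original module):
# applying a scoring function in chunks of two rows. The `func` here is a
# made-up stand-in that just doubles the values.
#
#   >>> df = pd.DataFrame({'a': [1, 2, 3]})
#   >>> apply_in_chunks(df, lambda chunk, _: chunk * 2, n=2)['a'].tolist()
#   [2, 4, 6]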


def get_indicies_of_unseen_categorical_values(X, columns):
    """Find the X-axis indices (row numbers) where new values are present.

    Args:
        X (dataframe): feature dataframe
        columns (list): column names

    Returns:
        new_cat_idx (np array): row indices
        new_categorical_columns (np array): new categorical columns
    """
    new_categorical_columns = np.setdiff1d(X.columns, columns)
    if len(new_categorical_columns) == 0:
        return (new_categorical_columns, None)
    new_cat_idx = np.where(X[new_categorical_columns].any(axis=1).values)[0]
    return new_cat_idx, new_categorical_columns


def skip_unseen_categorical_values(X, y, row_idx, new_categorical_columns):
    """Remove rows with unseen categorical value(s) from X.

    Args:
        X (dataframe): feature dataframe
        y (pd series): target series
        row_idx (np array): row indices
        new_categorical_columns (np array): new categorical columns

    Returns:
        X (dataframe): feature dataframe
        y (pd series): target series
    """
    X.drop(row_idx, axis=0, inplace=True)
    X.drop(new_categorical_columns, axis=1, inplace=True)
    if y is not None:
        y.drop(row_idx, axis=0, inplace=True)
    cexc.messages.warn(
        'Some events containing unseen categorical feature values have been skipped '
        'while updating the model.'
    )
    return X, y


def skip_unseen_target_values(X, y, classes):
    """Remove unseen categorical values from y. Also remove the rows of X
    that correspond to the removed y values.

    Args:
        X (dataframe): feature dataframe
        y (pd series): target series
        classes (np array): unique classes

    Returns:
        X (dataframe): feature dataframe
        y (pd series): target series
    """
    new_cat_cols_y = np.setdiff1d(np.unique(y), classes)
    if len(new_cat_cols_y) == 0:
        return X, y

    # Only keep rows where y is a previously seen class
    seen_categorical_mask = y.isin(classes)
    X = X[seen_categorical_mask]
    y = y[seen_categorical_mask]
    cexc.messages.warn(
        'Some events containing unseen categorical target values have been skipped '
        'while updating the model.'
    )
    return X, y


def check_and_convert_target_variable(df, field):
    """Check that the target variable is present, then convert its non-null
    values to str.

    If the target variable holds integer values but has float dtype, cast it
    to int before converting to str, since pandas promotes integer series to
    float when NaNs are present.

    Args:
        df (dataframe): input dataframe
        field (str): target field name

    Returns:
        y (pd series): target series
    """
    assert_field_present(df, field)
    y = df[field]
    y_notnull = y[y.notnull()]
    if y.dtype.kind == 'f' and np.all(y_notnull == y_notnull.astype(int)):
        y.loc[y_notnull.index] = y_notnull.astype(int).astype(str)
    else:
        y.loc[y_notnull.index] = y_notnull.astype(str)
    return y
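

# Usage sketch (added for illustration; not part of the original module, and
# assuming a pandas version that permits assigning strings into a float
# series): a float column whose non-null values are whole numbers is cast
# through int first, so 1.0 becomes '1' rather than '1.0'.
#
#   >>> df = pd.DataFrame({'y': [1.0, np.nan, 2.0]})
#   >>> check_and_convert_target_variable(df, 'y').tolist()
#   ['1', nan, '2']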


def verify_columns_are_categorical(df, fields):
    """Verify that the fields are present in the input dataframe and that
    they are string, boolean or integer typed; raise an error otherwise.

    Args:
        df (dataframe): input dataframe
        fields (list): list of field names to split by

    Returns:
        None if no error is raised.

    Raises:
        - Error when the fields are not present in the df
        - Error when a field's type does not fall into any of the categories
          in allowed_types
    """
    for field in fields:
        assert_field_present(df, field)

    allowed_types = (
        int,
        bool,
        str,
        np.bool_,
        np.object_,
        np.int8,
        np.int16,
        np.int32,
        np.int64,
        np.uint8,
        np.uint16,
        np.uint32,
        np.uint64,
    )
    for field, dtype in df[fields].dtypes.items():
        if dtype not in allowed_types:
            raise RuntimeError(
                'This operation allows only string, boolean, and integer values, while '
                'field {} is none of these types.'.format(field)
            )


def split_by(df, fields, max_groups=None):
    """Split the rows of the input dataframe into groups given a set of
    fields. Optionally check that the number of resulting groups does not
    exceed a limit.

    NOTE: this function verifies that the fields used in the split operation
    are only string, boolean or integer, and raises an error otherwise.

    Args:
        df (dataframe): input dataframe
        fields (list): list of field names to split by
        max_groups (int): maximum allowed number of groups. If the number of
            resulting groups is larger than this number, an error is raised

    Returns:
        pandas.DataFrameGroupBy: the resulting groups

    Raises:
        - Error when the fields are not present in the df
        - Error when a field's type does not fall into any of the categories
          in allowed_types
    """
    verify_columns_are_categorical(df, fields)
    groups = df.groupby(fields, as_index=False)
    if max_groups and len(groups) > max_groups:
        raise RuntimeError(
            'The number of groups cannot exceed {}; the current number of groups is {}. '
            'Please find detailed information about the number of groups in the docs.'.format(
                max_groups, len(groups)
            )
        )
    return groups
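

# Usage sketch (added for illustration; not part of the original module):
# splitting events by a categorical field yields one group per distinct
# value. The data values are made up.
#
#   >>> df = pd.DataFrame({'host': ['a', 'a', 'b'], 'val': [1, 2, 3]})
#   >>> groups = split_by(df, ['host'], max_groups=10)
#   >>> len(groups)
#   2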


def remove_duplicates(fields):
    """Remove duplicate elements from the fields list.

    Args:
        fields (list): list of field names

    Returns:
        list of distinct field names
    """
    return list(OrderedDict.fromkeys(fields))
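

# Usage sketch (added for illustration; not part of the original module):
# OrderedDict.fromkeys de-duplicates while preserving the first occurrence
# of each field, unlike set(), which would lose the ordering.
#
#   >>> remove_duplicates(['a', 'b', 'a', 'c'])
#   ['a', 'b', 'c']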


def is_empty_df(df):
    """Return True if the dataframe has no rows.

    Args:
        df (dataframe): input dataframe

    Returns:
        bool
    """
    return len(df) == 0