# Splunk_Deploiement/apps/Splunk_ML_Toolkit/bin/experiment/evaluation_metrics.py

from copy import deepcopy
from io import StringIO
from numpy import sqrt
import pandas as pd
from sklearn.model_selection import train_test_split
import cexc
from algos.AutoPrediction import AutoPrediction
from algos_support.density_function.column_name import make_column_name
from scorings.classification.Accuracy import AccuracyScoring
from scorings.classification.PrecisionRecallFscoreSupport import (
    PrecisionRecallFscoreSupportScoring,
)
from scorings.regression.R2 import R2Scoring
from scorings.regression.MeanSquaredError import MeanSquaredErrorScoring
from scorings.clustering.Silhouette import SilhouetteScoring

logger = cexc.get_logger('LogexperimentStatistics')
messages = cexc.get_messages_logger()

forecastMetadata = '_forecastMetadata'
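# Note: downstream code (compute_sf_statistics) keeps only rows where this
# column equals 'hf', which appears to mark the holdout/forecast span scored
# by Smart Forecasting; that reading is inferred from usage here, not from a
# documented contract.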


def get_statistics_metadata(experiment, body):
    """Load experiment metadata and input data.

    Args:
        experiment (dict): fetched experiment from REST
        body (csv str): csv string data payload from CEXC

    Returns:
        exp_metadata (dict or None): experiment metadata, if it was accessible:
            - experiment type (str)
            - target variable field names (list)
            - predicted field names (list)
            - data containing ground-truth/predicted data (pd.DataFrame)
            - algorithm parameters (dict)
            If not accessible, returns None.
    """
    debugging_msg_prefix = 'Statistics could not be computed -- {}.'
    exp_type = experiment.get('type')

    # Get the 'main' search stage, as it contains the desired target/feature variables
    search_stages = experiment.get('searchStages', [])
    if len(search_stages) == 0:
        logger.error(debugging_msg_prefix.format('experiment has no searchStages'))
        return None

    # We assume each 'stage' is a python dictionary (searchStages is a list of dictionaries)
    main_search_stages = [stage for stage in search_stages if stage.get('role') == 'main']
    if len(main_search_stages) != 1:
        logger.error(debugging_msg_prefix.format("requires exactly one 'main' search stage"))
        return None

    # Get the target variables and predicted fields
    main_search_stage = main_search_stages[0]
    target_variables = main_search_stage.get('targetVariables')
    if target_variables is None:
        # Check for the univariate 'targetVariable' field
        target_variable = main_search_stage.get('targetVariable')
        if target_variable is None:
            logger.error(
                debugging_msg_prefix.format(
                    'no target variable(s) found in main search stage'
                )
            )
            return None
        else:
            target_variables = [target_variable]
    elif not (
        type(target_variables) is list
        and all(isinstance(t, str) for t in target_variables)
        and len(target_variables) > 0
    ):
        # Ensure we have a list containing at least one string (target column name)
        logger.error(
            debugging_msg_prefix.format(
                'malformed "targetVariables" field: {}'.format(target_variables)
            )
        )
        return None

    if exp_type == 'smart_outlier_detection':
        prefix = "IsOutlier"
        predicted_fields = [
            make_column_name(main_name=prefix, feature_variable=t) for t in target_variables
        ]
    elif exp_type == 'smart_clustering':
        predicted_fields = ['cluster']
    else:
        predicted_fields = ['predicted({})'.format(t) for t in target_variables]

    extract_fields_list = target_variables + predicted_fields
    if exp_type == 'smart_forecasting':
        extract_fields_list = extract_fields_list + [forecastMetadata]

    # pandas requires unicode column names
    cols_to_extract = [str(f) for f in extract_fields_list]
    sio = StringIO(body)
    sio.seek(0)
    try:  # Ensure that the data contains the target variables and predicted fields
        applied_data = pd.read_csv(sio, usecols=cols_to_extract)
    except ValueError as e:
        msg = "data must contain the columns: {}"
        logger.debug(msg.format(cols_to_extract))
        logger.debug(e)
        return None

    algorithm_params = (
        search_stages[1].get('algorithmParams')
        if len(search_stages) > 1
        else search_stages[0].get('algorithmParams')
    )

    # Return the metadata required for scoring
    exp_metadata = {
        'type': exp_type,
        'target_variables': target_variables,
        'predicted': predicted_fields,
        'data': applied_data,
        'algorithmParams': algorithm_params,
    }
    return exp_metadata
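
# Illustrative shape of the returned metadata, assuming a hypothetical PNF
# experiment with target 'price' (field names are made up for illustration):
#
#   {
#       'type': 'predict_numeric_fields',
#       'target_variables': ['price'],
#       'predicted': ['predicted(price)'],
#       'data': <pd.DataFrame with columns 'price' and 'predicted(price)'>,
#       'algorithmParams': {...},
#   }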


def get_scoring_results(scoring_class, scoring_opts, data):
    """Call the scoring function and compute the results.

    Args:
        scoring_class (scoring obj): object capable of computing the score
        scoring_opts (dict): options passed to the scoring object
        data (pd.DataFrame): data containing predicted and ground-truth data

    Returns:
        results (pd.DataFrame): results of applying the scoring function
    """
    scorer = scoring_class(scoring_opts)
    results = scorer.score(data, scoring_opts)
    return results
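
# Illustrative call, reusing the hypothetical 'price' fields from above:
#
#   opts = {'scoring_name': 'r2_score', 'params': {}, 'variables': [],
#           'a_variables': ['price'], 'b_variables': ['predicted(price)']}
#   r2 = get_scoring_results(R2Scoring, opts, exp_metadata['data'])
#   value = float(r2['r2_score'][0])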


def compute_pcf_statistics(exp_metadata, ndigits=2):
    """Compute the statistics for a PCF experiment.

    Computes precision, recall, f1 and accuracy scores. For precision,
    recall and f1, a 'weighted' averaging scheme is used.

    Args:
        exp_metadata (dict): Metadata of an experiment, containing:
            - experiment type (str)
            - ground-truth field names (list)
            - predicted field names (list)
            - data containing ground-truth/predicted data (pd.DataFrame)
        ndigits (int): Number of digits to keep after the decimal place

    Returns:
        statistics_dict (dict): Dictionary of computed statistics
    """
    opts_skeleton = {
        'scoring_name': '',
        'params': {},
        'variables': [],
        'a_variables': exp_metadata['target_variables'],
        'b_variables': exp_metadata['predicted'],
    }
    try:
        # Get the precision, recall and f1-score
        opts = deepcopy(opts_skeleton)
        opts['scoring_name'] = 'precision_recall_fscore_support'
        opts['params']['average'] = 'weighted'
        p_r_f_s = get_scoring_results(
            PrecisionRecallFscoreSupportScoring, opts, exp_metadata['data']
        )

        # Get the accuracy score
        opts = deepcopy(opts_skeleton)
        opts['scoring_name'] = 'accuracy_score'
        accuracy = get_scoring_results(AccuracyScoring, opts, exp_metadata['data'])

        # Create the statistics dictionary; keys must be compatible with the experiment schema
        statistics_dict = {
            'stats_precision': round(float(p_r_f_s['precision'][0]), ndigits),
            'stats_recall': round(float(p_r_f_s['recall'][0]), ndigits),
            'stats_f1': round(float(p_r_f_s['fbeta_score'][0]), ndigits),
            'stats_accuracy': round(float(accuracy['accuracy_score'][0]), ndigits),
        }
    except Exception as e:
        msg = 'PCF statistics could not be computed -- failed to evaluate scoring metrics on experiment metadata.'
        logger.debug(msg)
        logger.debug(e)
        statistics_dict = {}
    return statistics_dict
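
# On success, a flat dict compatible with the experiment schema, e.g.
# (values illustrative only):
#   {'stats_precision': 0.91, 'stats_recall': 0.9, 'stats_f1': 0.9, 'stats_accuracy': 0.92}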


def compute_pnf_statistics(exp_metadata, ndigits_rmse=2, ndigits_r2=4):
    """Compute the statistics for a PNF experiment.

    Computes r^2 (coefficient of determination) and RMSE (root mean squared error).

    Args:
        exp_metadata (dict): Metadata of an experiment, containing:
            - experiment type (str)
            - ground-truth field names (list)
            - predicted field names (list)
            - data containing ground-truth/predicted data (pd.DataFrame)
        ndigits_rmse (int): Number of digits to keep after the decimal place
            for the root-mean-squared-error metric
        ndigits_r2 (int): Number of digits to keep after the decimal place
            for the r^2 metric

    Returns:
        statistics_dict (dict): Dictionary of computed statistics
    """
    opts_skeleton = {
        'scoring_name': '',
        'params': {},
        'a_variables': exp_metadata['target_variables'],
        'b_variables': exp_metadata['predicted'],
        'variables': [],
    }
    try:
        # Get the r^2 statistic
        opts = deepcopy(opts_skeleton)
        opts['scoring_name'] = 'r2_score'
        r2 = get_scoring_results(R2Scoring, opts, exp_metadata['data'])

        # Get the RMSE statistic
        opts = deepcopy(opts_skeleton)
        opts['scoring_name'] = 'mean_squared_error'
        mse = get_scoring_results(MeanSquaredErrorScoring, opts, exp_metadata['data'])

        # Create the statistics dictionary; keys must be compatible with the experiment schema
        statistics_dict = {
            # For r^2, we round to 4 decimal places for historical reasons
            'stats_rSquared': round(float(r2['r2_score'][0]), ndigits_r2),
            # Take the square root to obtain RMSE
            'stats_RMSE': round(sqrt(float(mse['mean_squared_error'][0])), ndigits_rmse),
        }
    except Exception as e:
        msg = 'PNF statistics could not be computed -- failed to evaluate scoring metrics on experiment metadata.'
        logger.debug(msg)
        logger.debug(e)
        statistics_dict = {}
    return statistics_dict
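
# RMSE is derived rather than scored directly: since RMSE = sqrt(MSE), a
# mean_squared_error of 4.0 would be reported as stats_RMSE = 2.0.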


def compute_sf_statistics(exp_metadata, ndigits_rmse=2, ndigits_r2=4):
    """Compute the statistics for a Smart Forecast experiment.

    Computes r^2 (coefficient of determination) and RMSE (root mean squared error).

    Args:
        exp_metadata (dict): Metadata of an experiment, containing:
            - experiment type (str)
            - ground-truth field names (list)
            - predicted field names (list)
            - data containing ground-truth/predicted data (pd.DataFrame)
        ndigits_rmse (int): Number of digits to keep after the decimal place
            for the root-mean-squared-error metric
        ndigits_r2 (int): Number of digits to keep after the decimal place
            for the r^2 metric

    Returns:
        statistics_dict (dict): Dictionary of computed statistics
    """
    opts_skeleton = {
        'scoring_name': '',
        'params': {},
        'a_variables': exp_metadata['target_variables'],
        'b_variables': exp_metadata['predicted'],
        'variables': [],
    }
    try:
        # Get the r^2 statistic, scoring only the rows whose forecast-metadata
        # column is 'hf' (presumably the holdout/forecast span)
        opts = deepcopy(opts_skeleton)
        opts['scoring_name'] = 'r2_score'
        dataset = exp_metadata['data'].copy()
        exp_metadata['data'] = dataset[dataset[forecastMetadata] == "hf"]
        r2 = get_scoring_results(R2Scoring, opts, exp_metadata['data'])

        # Get the RMSE statistic
        opts = deepcopy(opts_skeleton)
        opts['scoring_name'] = 'mean_squared_error'
        mse = get_scoring_results(MeanSquaredErrorScoring, opts, exp_metadata['data'])

        stats_rSquared, stats_RMSE = [], []
        for idx, t in enumerate(exp_metadata['target_variables']):
            # For r^2, we round to 4 decimal places for historical reasons
            stats_rSquared.append(
                {'key': t, 'value': round(float(r2['r2_score'][idx]), ndigits_r2)}
            )
            # Take the square root to obtain RMSE
            stats_RMSE.append(
                {
                    'key': t,
                    'value': round(sqrt(float(mse['mean_squared_error'][idx])), ndigits_rmse),
                }
            )

        # Create the statistics dictionary; keys must be compatible with the experiment schema
        statistics_dict = {'stats_rSquared': stats_rSquared, 'stats_RMSE': stats_RMSE}
    except Exception as e:
        msg = 'Smart Forecast statistics could not be computed -- failed to evaluate scoring metrics on experiment metadata.'
        logger.debug(msg)
        logger.debug(e)
        statistics_dict = {}
    return statistics_dict
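
# Unlike PNF, Smart Forecast reports per-target lists, e.g. (target name and
# values illustrative only):
#   {'stats_rSquared': [{'key': 'sales', 'value': 0.9731}],
#    'stats_RMSE': [{'key': 'sales', 'value': 12.47}]}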


def compute_soda_statistics(exp_metadata, outlier_value=1.0):
    """Compute the statistics for a Smart Outlier Detection experiment.

    Computes the number of outliers for each field being predicted on.

    Args:
        exp_metadata (dict): Metadata of an experiment, containing:
            - experiment type (str)
            - ground-truth field names (list)
            - predicted field names (list)
            - data containing ground-truth/predicted data (pd.DataFrame)
        outlier_value (float): Value marking an entry in the data as
            predicted to be an outlier

    Returns:
        statistics_dict (dict): Dictionary of computed statistics
    """
    try:
        # Get the outlier count statistic
        data = deepcopy(exp_metadata['data'])
        statistics_dict, stats_outlierCount = {}, []
        # Count outliers for each target variable
        for t in exp_metadata['target_variables']:
            outlierCol = make_column_name(main_name="IsOutlier", feature_variable=t)
            outlierCount = int((data[outlierCol] == outlier_value).sum())
            stats_outlierCount.append({'key': t, 'value': outlierCount})
        # Create the statistics dictionary; keys must be compatible with the experiment schema
        statistics_dict['stats_outlierCount'] = stats_outlierCount
    except Exception as e:
        msg = 'Smart Outlier Detection statistics could not be computed -- failed to evaluate scoring metrics on experiment metadata.'
        logger.debug(msg)
        logger.debug(e)
        statistics_dict = {}
    return statistics_dict
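
# Example output, assuming a single hypothetical target 'latency':
#   {'stats_outlierCount': [{'key': 'latency', 'value': 17}]}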


def compute_sc_statistics(exp_metadata, ndigits_silhouette=2):
    """Compute the statistics for a Smart Clustering experiment.

    Computes the silhouette score.

    Args:
        exp_metadata (dict): Metadata of an experiment, containing:
            - experiment type (str)
            - predicted field name (list containing only the label field name)
            - target variable field names (list)
            - data containing target_variables/label data (pd.DataFrame)
        ndigits_silhouette (int): Number of digits to keep after the decimal
            place for the silhouette metric

    Returns:
        statistics_dict (dict): Dictionary of computed statistics
    """
    opts = {
        'scoring_name': 'silhouette_score',
        'params': {},
        'a_variables': exp_metadata['predicted'],
        'b_variables': exp_metadata['target_variables'],
        'variables': [],
    }
    try:
        # Get the silhouette score
        silhouette = get_scoring_results(SilhouetteScoring, opts, exp_metadata['data'])
        # Create the statistics dictionary; keys must be compatible with the experiment schema
        statistics_dict = {
            'stats_silhouette_score': round(
                float(silhouette['silhouette_score']), ndigits_silhouette
            )
        }
    except Exception as e:
        msg = 'Smart Clustering statistics could not be computed -- failed to evaluate scoring metrics on experiment metadata.'
        logger.debug(msg)
        logger.debug(e)
        statistics_dict = {}
    return statistics_dict
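
# Silhouette scores lie in [-1, 1]: values near 1 indicate well-separated
# clusters, values near 0 indicate overlapping clusters, and negative values
# suggest points assigned to the wrong cluster.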


def compute_sp_statistics(exp_metadata, ndigits=2, ndigits_r2=4):
    """Compute the statistics for a Smart Prediction experiment.

    - Computes r^2 (coefficient of determination) and RMSE (root mean
      squared error) when the experiment is numerical.
    - Computes accuracy, precision, f1_score and recall when the
      experiment is categorical.

    Args:
        exp_metadata (dict): Metadata of an experiment, containing:
            - experiment type (str)
            - ground-truth field names (list)
            - predicted field names (list)
            - data containing ground-truth/predicted data (pd.DataFrame)
        ndigits (int): Number of digits to keep after the decimal place
            for any scoring metric
        ndigits_r2 (int): Number of digits to keep after the decimal place
            for the r^2 metric

    Returns:
        statistics_dict (dict): Dictionary of computed statistics
    """
    opts_skeleton = {
        'scoring_name': '',
        'params': {},
        'a_variables': exp_metadata.get('target_variables'),
        'b_variables': exp_metadata.get('predicted'),
        'variables': [],
    }
    data = exp_metadata.get('data')
    algo_params = exp_metadata.get('algorithmParams')
    # Guard against a missing algorithmParams entry so we fail soft here,
    # matching the empty-dict behavior of the other metric functions
    test_split_ratio = algo_params.get('test_split_ratio', 0) if algo_params else 0
    if test_split_ratio > 0:
        train, test = train_test_split(data, test_size=test_split_ratio)
    else:
        train = data
        test = pd.DataFrame()
    try:
        # There is only one target variable in Smart Prediction; send it as a
        # field to the is_categorical method
        categorical = AutoPrediction.is_categorical(
            data, exp_metadata.get('target_variables')[0], algo_params
        )
        stats_rSquared, stats_RMSE = [], []
        stats_accuracy, stats_precision, stats_recall, stats_f1 = [], [], [], []
        if categorical:
            # Get the accuracy statistic
            opts = deepcopy(opts_skeleton)
            opts['scoring_name'] = 'accuracy_score'
            accuracy_train = get_scoring_results(AccuracyScoring, opts, train)
            if test_split_ratio > 0:
                accuracy_test = get_scoring_results(AccuracyScoring, opts, test)

            # Get the precision, recall and f1 statistics
            opts = deepcopy(opts_skeleton)
            opts['scoring_name'] = 'precision_recall_fscore_support'
            opts['params'] = {'average': 'weighted'}
            precision_recall_f1_train = get_scoring_results(
                PrecisionRecallFscoreSupportScoring, opts, train
            )
            if test_split_ratio > 0:
                precision_recall_f1_test = get_scoring_results(
                    PrecisionRecallFscoreSupportScoring, opts, test
                )

            # There is only one target_variable in Smart Prediction, so we take
            # the first value for all scores
            stats_accuracy.append(
                {
                    'key': 'Training',
                    'value': round(float(accuracy_train['accuracy_score'][0]), ndigits),
                }
            )
            stats_f1.append(
                {
                    'key': 'Training',
                    'value': round(float(precision_recall_f1_train['fbeta_score'][0]), ndigits),
                }
            )
            stats_precision.append(
                {
                    'key': 'Training',
                    'value': round(float(precision_recall_f1_train['precision'][0]), ndigits),
                }
            )
            stats_recall.append(
                {
                    'key': 'Training',
                    'value': round(float(precision_recall_f1_train['recall'][0]), ndigits),
                }
            )

            # Calculate the test scores only if there is test data
            if test_split_ratio > 0:
                stats_accuracy.append(
                    {
                        'key': 'Testing',
                        'value': round(float(accuracy_test['accuracy_score'][0]), ndigits),
                    }
                )
                stats_f1.append(
                    {
                        'key': 'Testing',
                        'value': round(float(precision_recall_f1_test['fbeta_score'][0]), ndigits),
                    }
                )
                stats_precision.append(
                    {
                        'key': 'Testing',
                        'value': round(float(precision_recall_f1_test['precision'][0]), ndigits),
                    }
                )
                stats_recall.append(
                    {
                        'key': 'Testing',
                        'value': round(float(precision_recall_f1_test['recall'][0]), ndigits),
                    }
                )
        else:
            # Get the r^2 statistic
            opts = deepcopy(opts_skeleton)
            opts['scoring_name'] = 'r2_score'
            r2_train = get_scoring_results(R2Scoring, opts, train)
            if test_split_ratio > 0:
                r2_test = get_scoring_results(R2Scoring, opts, test)

            # Get the RMSE statistic
            opts = deepcopy(opts_skeleton)
            opts['scoring_name'] = 'mean_squared_error'
            mse_train = get_scoring_results(MeanSquaredErrorScoring, opts, train)
            if test_split_ratio > 0:
                mse_test = get_scoring_results(MeanSquaredErrorScoring, opts, test)

            stats_rSquared.append(
                {'key': 'Training', 'value': round(float(r2_train['r2_score'][0]), ndigits_r2)}
            )
            # Take the square root to obtain RMSE
            stats_RMSE.append(
                {
                    'key': 'Training',
                    'value': round(sqrt(float(mse_train['mean_squared_error'][0])), ndigits),
                }
            )

            # Calculate and append the test scores only if there is test data
            if test_split_ratio > 0:
                stats_rSquared.append(
                    {
                        'key': 'Testing',
                        'value': round(float(r2_test['r2_score'][0]), ndigits_r2),
                    }
                )
                stats_RMSE.append(
                    {
                        'key': 'Testing',
                        'value': round(sqrt(float(mse_test['mean_squared_error'][0])), ndigits),
                    }
                )
        statistics_dict = {
            'stats_rSquared': stats_rSquared,
            'stats_RMSE': stats_RMSE,
            'stats_accuracy': stats_accuracy,
            'stats_f1': stats_f1,
            'stats_precision': stats_precision,
            'stats_recall': stats_recall,
        }
    except Exception as e:
        msg = 'Smart Prediction statistics could not be computed -- failed to evaluate scoring metrics on experiment metadata.'
        logger.debug(msg)
        logger.debug(e)
        statistics_dict = {}
    return statistics_dict
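
# Smart Prediction keys its lists by data split rather than by target, e.g.
# (values illustrative only):
#   {'stats_accuracy': [{'key': 'Training', 'value': 0.95},
#                       {'key': 'Testing', 'value': 0.91}], ...}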


def _merge_exp_metadata(exp_metadata_list):
    """Merge the list of experiment metadata into a single entry.

    Args:
        exp_metadata_list (list): list of experiment metadata dicts
            (each metadata dict is the output of get_statistics_metadata())

    Returns:
        exp_metadata (dict): Single experiment metadata with merged 'data' field
    """
    data_fieldname = 'data'
    data_list = [exp_metadata.get(data_fieldname) for exp_metadata in exp_metadata_list]
    # Fix the index after merging the data field
    data_df = pd.concat(data_list)
    data_df.index = list(range(len(data_df)))
    # All metadata entries should have the same fields and values except for the data
    exp_metadata = exp_metadata_list[-1]
    exp_metadata[data_fieldname] = data_df
    return exp_metadata
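
# Sketch of the merge, assuming two hypothetical chunks of the same experiment:
#   merged = _merge_exp_metadata([chunk_1_metadata, chunk_2_metadata])
#   merged['data']  # chunk dataframes concatenated with a fresh 0..n-1 index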


def compute_statistics(exp_metadata_list):
    """Compute the statistics for the given prediction problem.

    Accepted prediction problems are:
        - Predict Categorical Fields (PCF)
        - Predict Numeric Fields (PNF)
        - Smart Forecast
        - Smart Outlier Detection
        - Smart Clustering
        - Smart Prediction

    Args:
        exp_metadata_list (list): list of experiment metadata dicts
            (each metadata dict is the output of get_statistics_metadata())

    Returns:
        statistics (dict): dictionary of statistics results.
            An empty dictionary is returned if results could not be calculated.
    """
    # If any chunk is None, don't compute statistics
    if not all(exp_metadata_list):
        return {}
    exp_metadata = _merge_exp_metadata(exp_metadata_list)
    exp_type = exp_metadata.get('type')
    if exp_type == 'predict_categorical_fields':
        statistics_dict = compute_pcf_statistics(exp_metadata)
    elif exp_type == 'predict_numeric_fields':
        statistics_dict = compute_pnf_statistics(exp_metadata)
    elif exp_type == 'cluster_numeric_events':
        # cluster_numeric_events is not implemented yet
        statistics_dict = {}
    elif exp_type == 'smart_forecast':
        # Note: get_statistics_metadata extracts the forecast-metadata column
        # for the type string 'smart_forecasting', not 'smart_forecast'
        statistics_dict = compute_sf_statistics(exp_metadata)
    elif exp_type == 'smart_outlier_detection':
        statistics_dict = compute_soda_statistics(exp_metadata)
    elif exp_type == 'smart_clustering':
        statistics_dict = compute_sc_statistics(exp_metadata)
    elif exp_type == 'smart_prediction':
        statistics_dict = compute_sp_statistics(exp_metadata)
    else:
        logger.debug(
            "Cannot compute experiment statistics on experiment of type: {}.".format(exp_type)
        )
        statistics_dict = {}
    return statistics_dict
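
# Illustrative end-to-end flow; `experiment_json` and `csv_chunks` are
# hypothetical inputs, not names defined in this module:
#
#   metadata_chunks = [get_statistics_metadata(experiment_json, chunk) for chunk in csv_chunks]
#   stats = compute_statistics(metadata_chunks)  # {} if any chunk failed to load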