# Splunk_Deploiement/apps/Splunk_ML_Toolkit/bin/experiment/evaluation_metrics.py

from copy import deepcopy
from io import StringIO
from numpy import sqrt
import pandas as pd
from sklearn.model_selection import train_test_split
import cexc
from algos.AutoPrediction import AutoPrediction
from algos_support.density_function.column_name import make_column_name
from scorings.classification.Accuracy import AccuracyScoring
from scorings.classification.PrecisionRecallFscoreSupport import (
    PrecisionRecallFscoreSupportScoring,
)
from scorings.regression.R2 import R2Scoring
from scorings.regression.MeanSquaredError import MeanSquaredErrorScoring
from scorings.clustering.Silhouette import SilhouetteScoring

logger = cexc.get_logger('LogexperimentStatistics')
messages = cexc.get_messages_logger()

forecastMetadata = '_forecastMetadata'
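# Note: downstream code (compute_sf_statistics) keeps only rows where this
# column equals 'hf', which appears to mark the holdout/forecast span scored
# by Smart Forecasting; that reading is inferred from usage here, not from a
# documented contract.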


def get_statistics_metadata(experiment, body):
    """Load experiment metadata and input data.

    Args:
        experiment (dict): fetched experiment from REST
        body (csv str): csv string data payload from CEXC

    Returns:
        exp_metadata (dict or None): experiment metadata, if it was accessible:
            - experiment type (str)
            - target variable field names (list)
            - predicted field names (list)
            - data containing ground-truth/predicted data (pd.DataFrame)
            - algorithm parameters (dict)
            If not accessible, returns None.
    """
    debugging_msg_prefix = 'Statistics could not be computed -- {}.'
    exp_type = experiment.get('type')

    # Get the 'main' search stage, as it contains the desired target/feature variables
    search_stages = experiment.get('searchStages', [])
    if len(search_stages) == 0:
        logger.error(debugging_msg_prefix.format('experiment has no searchStages'))
        return None

    # We assume each 'stage' is a python dictionary (searchStages is a list of dictionaries)
    main_search_stages = [stage for stage in search_stages if stage.get('role') == 'main']
    if len(main_search_stages) != 1:
        logger.error(debugging_msg_prefix.format("requires exactly one 'main' search stage"))
        return None

    # Get the target variables and predicted fields
    main_search_stage = main_search_stages[0]
    target_variables = main_search_stage.get('targetVariables')
    if target_variables is None:
        # Check for the univariate 'targetVariable' field
        target_variable = main_search_stage.get('targetVariable')
        if target_variable is None:
            logger.error(
                debugging_msg_prefix.format(
                    'no target variable(s) found in main search stage'
                )
            )
            return None
        else:
            target_variables = [target_variable]
    elif not (
        type(target_variables) is list
        and all(isinstance(t, str) for t in target_variables)
        and len(target_variables) > 0
    ):
        # Ensure we have a list containing at least one string (target column name)
        logger.error(
            debugging_msg_prefix.format(
                'malformed "targetVariables" field: {}'.format(target_variables)
            )
        )
        return None

    if exp_type == 'smart_outlier_detection':
        prefix = "IsOutlier"
        predicted_fields = [
            make_column_name(main_name=prefix, feature_variable=t) for t in target_variables
        ]
    elif exp_type == 'smart_clustering':
        predicted_fields = ['cluster']
    else:
        predicted_fields = ['predicted({})'.format(t) for t in target_variables]

    extract_fields_list = target_variables + predicted_fields
    if exp_type == 'smart_forecasting':
        extract_fields_list = extract_fields_list + [forecastMetadata]

    # pandas requires unicode column names
    cols_to_extract = [str(f) for f in extract_fields_list]
    sio = StringIO(body)
    sio.seek(0)
    try:  # Ensure that the data contains the target variables and predicted fields
        applied_data = pd.read_csv(sio, usecols=cols_to_extract)
    except ValueError as e:
        msg = "data must contain the columns: {}"
        logger.debug(msg.format(cols_to_extract))
        logger.debug(e)
        return None

    algorithm_params = (
        search_stages[1].get('algorithmParams')
        if len(search_stages) > 1
        else search_stages[0].get('algorithmParams')
    )

    # Return the metadata required for scoring
    exp_metadata = {
        'type': exp_type,
        'target_variables': target_variables,
        'predicted': predicted_fields,
        'data': applied_data,
        'algorithmParams': algorithm_params,
    }
    return exp_metadata
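
# Illustrative shape of the returned metadata, assuming a hypothetical PNF
# experiment with target 'price' (field names are made up for illustration):
#
#   {
#       'type': 'predict_numeric_fields',
#       'target_variables': ['price'],
#       'predicted': ['predicted(price)'],
#       'data': <pd.DataFrame with columns 'price' and 'predicted(price)'>,
#       'algorithmParams': {...},
#   }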


def get_scoring_results(scoring_class, scoring_opts, data):
    """Call the scoring function and compute the results.

    Args:
        scoring_class (scoring obj): object capable of computing the score
        scoring_opts (dict): options passed to the scoring object
        data (pd.DataFrame): data containing predicted and ground-truth data

    Returns:
        results (pd.DataFrame): results of applying the scoring function
    """
    scorer = scoring_class(scoring_opts)
    results = scorer.score(data, scoring_opts)
    return results
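
# Illustrative call, reusing the hypothetical 'price' fields from above:
#
#   opts = {'scoring_name': 'r2_score', 'params': {}, 'variables': [],
#           'a_variables': ['price'], 'b_variables': ['predicted(price)']}
#   r2 = get_scoring_results(R2Scoring, opts, exp_metadata['data'])
#   value = float(r2['r2_score'][0])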


def compute_pcf_statistics(exp_metadata, ndigits=2):
    """Compute the statistics for a PCF experiment.

    Computes precision, recall, f1 and accuracy scores. For precision,
    recall and f1, a 'weighted' averaging scheme is used.

    Args:
        exp_metadata (dict): Metadata of an experiment, containing:
            - experiment type (str)
            - ground-truth field names (list)
            - predicted field names (list)
            - data containing ground-truth/predicted data (pd.DataFrame)
        ndigits (int): Number of digits to keep after the decimal place

    Returns:
        statistics_dict (dict): Dictionary of computed statistics
    """
    opts_skeleton = {
        'scoring_name': '',
        'params': {},
        'variables': [],
        'a_variables': exp_metadata['target_variables'],
        'b_variables': exp_metadata['predicted'],
    }
    try:
        # Get the precision, recall and f1-score
        opts = deepcopy(opts_skeleton)
        opts['scoring_name'] = 'precision_recall_fscore_support'
        opts['params']['average'] = 'weighted'
        p_r_f_s = get_scoring_results(
            PrecisionRecallFscoreSupportScoring, opts, exp_metadata['data']
        )

        # Get the accuracy score
        opts = deepcopy(opts_skeleton)
        opts['scoring_name'] = 'accuracy_score'
        accuracy = get_scoring_results(AccuracyScoring, opts, exp_metadata['data'])

        # Create the statistics dictionary; keys must be compatible with the experiment schema
        statistics_dict = {
            'stats_precision': round(float(p_r_f_s['precision'][0]), ndigits),
            'stats_recall': round(float(p_r_f_s['recall'][0]), ndigits),
            'stats_f1': round(float(p_r_f_s['fbeta_score'][0]), ndigits),
            'stats_accuracy': round(float(accuracy['accuracy_score'][0]), ndigits),
        }
    except Exception as e:
        msg = 'PCF statistics could not be computed -- failed to evaluate scoring metrics on experiment metadata.'
        logger.debug(msg)
        logger.debug(e)
        statistics_dict = {}
    return statistics_dict
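
# On success, a flat dict compatible with the experiment schema, e.g.
# (values illustrative only):
#   {'stats_precision': 0.91, 'stats_recall': 0.9, 'stats_f1': 0.9, 'stats_accuracy': 0.92}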


def compute_pnf_statistics(exp_metadata, ndigits_rmse=2, ndigits_r2=4):
    """Compute the statistics for a PNF experiment.

    Computes r^2 (coefficient of determination) and RMSE (root mean squared error).

    Args:
        exp_metadata (dict): Metadata of an experiment, containing:
            - experiment type (str)
            - ground-truth field names (list)
            - predicted field names (list)
            - data containing ground-truth/predicted data (pd.DataFrame)
        ndigits_rmse (int): Number of digits to keep after the decimal place
            for the root-mean-squared-error metric
        ndigits_r2 (int): Number of digits to keep after the decimal place
            for the r^2 metric

    Returns:
        statistics_dict (dict): Dictionary of computed statistics
    """
    opts_skeleton = {
        'scoring_name': '',
        'params': {},
        'a_variables': exp_metadata['target_variables'],
        'b_variables': exp_metadata['predicted'],
        'variables': [],
    }
    try:
        # Get the r^2 statistic
        opts = deepcopy(opts_skeleton)
        opts['scoring_name'] = 'r2_score'
        r2 = get_scoring_results(R2Scoring, opts, exp_metadata['data'])

        # Get the RMSE statistic
        opts = deepcopy(opts_skeleton)
        opts['scoring_name'] = 'mean_squared_error'
        mse = get_scoring_results(MeanSquaredErrorScoring, opts, exp_metadata['data'])

        # Create the statistics dictionary; keys must be compatible with the experiment schema
        statistics_dict = {
            # For r^2, we round to 4 decimal places for historical reasons
            'stats_rSquared': round(float(r2['r2_score'][0]), ndigits_r2),
            # Take the square root to obtain RMSE
            'stats_RMSE': round(sqrt(float(mse['mean_squared_error'][0])), ndigits_rmse),
        }
    except Exception as e:
        msg = 'PNF statistics could not be computed -- failed to evaluate scoring metrics on experiment metadata.'
        logger.debug(msg)
        logger.debug(e)
        statistics_dict = {}
    return statistics_dict
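
# RMSE is derived rather than scored directly: since RMSE = sqrt(MSE), a
# mean_squared_error of 4.0 would be reported as stats_RMSE = 2.0.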


def compute_sf_statistics(exp_metadata, ndigits_rmse=2, ndigits_r2=4):
    """Compute the statistics for a Smart Forecast experiment.

    Computes r^2 (coefficient of determination) and RMSE (root mean squared error).

    Args:
        exp_metadata (dict): Metadata of an experiment, containing:
            - experiment type (str)
            - ground-truth field names (list)
            - predicted field names (list)
            - data containing ground-truth/predicted data (pd.DataFrame)
        ndigits_rmse (int): Number of digits to keep after the decimal place
            for the root-mean-squared-error metric
        ndigits_r2 (int): Number of digits to keep after the decimal place
            for the r^2 metric

    Returns:
        statistics_dict (dict): Dictionary of computed statistics
    """
    opts_skeleton = {
        'scoring_name': '',
        'params': {},
        'a_variables': exp_metadata['target_variables'],
        'b_variables': exp_metadata['predicted'],
        'variables': [],
    }
    try:
        # Get the r^2 statistic, scoring only the rows whose forecast-metadata
        # column is 'hf' (presumably the holdout/forecast span)
        opts = deepcopy(opts_skeleton)
        opts['scoring_name'] = 'r2_score'
        dataset = exp_metadata['data'].copy()
        exp_metadata['data'] = dataset[dataset[forecastMetadata] == "hf"]
        r2 = get_scoring_results(R2Scoring, opts, exp_metadata['data'])

        # Get the RMSE statistic
        opts = deepcopy(opts_skeleton)
        opts['scoring_name'] = 'mean_squared_error'
        mse = get_scoring_results(MeanSquaredErrorScoring, opts, exp_metadata['data'])

        stats_rSquared, stats_RMSE = [], []
        for idx, t in enumerate(exp_metadata['target_variables']):
            # For r^2, we round to 4 decimal places for historical reasons
            stats_rSquared.append(
                {'key': t, 'value': round(float(r2['r2_score'][idx]), ndigits_r2)}
            )
            # Take the square root to obtain RMSE
            stats_RMSE.append(
                {
                    'key': t,
                    'value': round(sqrt(float(mse['mean_squared_error'][idx])), ndigits_rmse),
                }
            )

        # Create the statistics dictionary; keys must be compatible with the experiment schema
        statistics_dict = {'stats_rSquared': stats_rSquared, 'stats_RMSE': stats_RMSE}
    except Exception as e:
        msg = 'Smart Forecast statistics could not be computed -- failed to evaluate scoring metrics on experiment metadata.'
        logger.debug(msg)
        logger.debug(e)
        statistics_dict = {}
    return statistics_dict
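
# Unlike PNF, Smart Forecast reports per-target lists, e.g. (target name and
# values illustrative only):
#   {'stats_rSquared': [{'key': 'sales', 'value': 0.9731}],
#    'stats_RMSE': [{'key': 'sales', 'value': 12.47}]}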


def compute_soda_statistics(exp_metadata, outlier_value=1.0):
    """Compute the statistics for a Smart Outlier Detection experiment.

    Computes the number of outliers for each field being predicted on.

    Args:
        exp_metadata (dict): Metadata of an experiment, containing:
            - experiment type (str)
            - ground-truth field names (list)
            - predicted field names (list)
            - data containing ground-truth/predicted data (pd.DataFrame)
        outlier_value (float): Value marking an entry in the data as
            predicted to be an outlier

    Returns:
        statistics_dict (dict): Dictionary of computed statistics
    """
    try:
        # Get the outlier count statistic
        data = deepcopy(exp_metadata['data'])
        statistics_dict, stats_outlierCount = {}, []
        # Count outliers for each target variable
        for t in exp_metadata['target_variables']:
            outlierCol = make_column_name(main_name="IsOutlier", feature_variable=t)
            outlierCount = int((data[outlierCol] == outlier_value).sum())
            stats_outlierCount.append({'key': t, 'value': outlierCount})
        # Create the statistics dictionary; keys must be compatible with the experiment schema
        statistics_dict['stats_outlierCount'] = stats_outlierCount
    except Exception as e:
        msg = 'Smart Outlier Detection statistics could not be computed -- failed to evaluate scoring metrics on experiment metadata.'
        logger.debug(msg)
        logger.debug(e)
        statistics_dict = {}
    return statistics_dict
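
# Example output, assuming a single hypothetical target 'latency':
#   {'stats_outlierCount': [{'key': 'latency', 'value': 17}]}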


def compute_sc_statistics(exp_metadata, ndigits_silhouette=2):
    """Compute the statistics for a Smart Clustering experiment.

    Computes the silhouette score.

    Args:
        exp_metadata (dict): Metadata of an experiment, containing:
            - experiment type (str)
            - predicted field name (list containing only the label field name)
            - target variable field names (list)
            - data containing target_variables/label data (pd.DataFrame)
        ndigits_silhouette (int): Number of digits to keep after the decimal
            place for the silhouette metric

    Returns:
        statistics_dict (dict): Dictionary of computed statistics
    """
    opts = {
        'scoring_name': 'silhouette_score',
        'params': {},
        'a_variables': exp_metadata['predicted'],
        'b_variables': exp_metadata['target_variables'],
        'variables': [],
    }
    try:
        # Get the silhouette score
        silhouette = get_scoring_results(SilhouetteScoring, opts, exp_metadata['data'])
        # Create the statistics dictionary; keys must be compatible with the experiment schema
        statistics_dict = {
            'stats_silhouette_score': round(
                float(silhouette['silhouette_score']), ndigits_silhouette
            )
        }
    except Exception as e:
        msg = 'Smart Clustering statistics could not be computed -- failed to evaluate scoring metrics on experiment metadata.'
        logger.debug(msg)
        logger.debug(e)
        statistics_dict = {}
    return statistics_dict
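
# Silhouette scores lie in [-1, 1]: values near 1 indicate well-separated
# clusters, values near 0 indicate overlapping clusters, and negative values
# suggest points assigned to the wrong cluster.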


def compute_sp_statistics(exp_metadata, ndigits=2, ndigits_r2=4):
    """Compute the statistics for a Smart Prediction experiment.

    - Computes r^2 (coefficient of determination) and RMSE (root mean
      squared error) when the experiment is numerical.
    - Computes accuracy, precision, f1_score and recall when the
      experiment is categorical.

    Args:
        exp_metadata (dict): Metadata of an experiment, containing:
            - experiment type (str)
            - ground-truth field names (list)
            - predicted field names (list)
            - data containing ground-truth/predicted data (pd.DataFrame)
        ndigits (int): Number of digits to keep after the decimal place
            for any scoring metric
        ndigits_r2 (int): Number of digits to keep after the decimal place
            for the r^2 metric

    Returns:
        statistics_dict (dict): Dictionary of computed statistics
    """
    opts_skeleton = {
        'scoring_name': '',
        'params': {},
        'a_variables': exp_metadata.get('target_variables'),
        'b_variables': exp_metadata.get('predicted'),
        'variables': [],
    }
    data = exp_metadata.get('data')
    algo_params = exp_metadata.get('algorithmParams')
    # Guard against a missing algorithmParams entry so we fail soft here,
    # matching the empty-dict behavior of the other metric functions
    test_split_ratio = algo_params.get('test_split_ratio', 0) if algo_params else 0
    if test_split_ratio > 0:
        train, test = train_test_split(data, test_size=test_split_ratio)
    else:
        train = data
        test = pd.DataFrame()
    try:
        # There is only one target variable in Smart Prediction; send it as a
        # field to the is_categorical method
        categorical = AutoPrediction.is_categorical(
            data, exp_metadata.get('target_variables')[0], algo_params
        )
        stats_rSquared, stats_RMSE = [], []
        stats_accuracy, stats_precision, stats_recall, stats_f1 = [], [], [], []
        if categorical:
            # Get the accuracy statistic
            opts = deepcopy(opts_skeleton)
            opts['scoring_name'] = 'accuracy_score'
            accuracy_train = get_scoring_results(AccuracyScoring, opts, train)
            if test_split_ratio > 0:
                accuracy_test = get_scoring_results(AccuracyScoring, opts, test)

            # Get the precision, recall and f1 statistics
            opts = deepcopy(opts_skeleton)
            opts['scoring_name'] = 'precision_recall_fscore_support'
            opts['params'] = {'average': 'weighted'}
            precision_recall_f1_train = get_scoring_results(
                PrecisionRecallFscoreSupportScoring, opts, train
            )
            if test_split_ratio > 0:
                precision_recall_f1_test = get_scoring_results(
                    PrecisionRecallFscoreSupportScoring, opts, test
                )

            # There is only one target_variable in Smart Prediction, so we take
            # the first value for all scores
            stats_accuracy.append(
                {
                    'key': 'Training',
                    'value': round(float(accuracy_train['accuracy_score'][0]), ndigits),
                }
            )
            stats_f1.append(
                {
                    'key': 'Training',
                    'value': round(float(precision_recall_f1_train['fbeta_score'][0]), ndigits),
                }
            )
            stats_precision.append(
                {
                    'key': 'Training',
                    'value': round(float(precision_recall_f1_train['precision'][0]), ndigits),
                }
            )
            stats_recall.append(
                {
                    'key': 'Training',
                    'value': round(float(precision_recall_f1_train['recall'][0]), ndigits),
                }
            )

            # Calculate the test scores only if there is test data
            if test_split_ratio > 0:
                stats_accuracy.append(
                    {
                        'key': 'Testing',
                        'value': round(float(accuracy_test['accuracy_score'][0]), ndigits),
                    }
                )
                stats_f1.append(
                    {
                        'key': 'Testing',
                        'value': round(float(precision_recall_f1_test['fbeta_score'][0]), ndigits),
                    }
                )
                stats_precision.append(
                    {
                        'key': 'Testing',
                        'value': round(float(precision_recall_f1_test['precision'][0]), ndigits),
                    }
                )
                stats_recall.append(
                    {
                        'key': 'Testing',
                        'value': round(float(precision_recall_f1_test['recall'][0]), ndigits),
                    }
                )
        else:
            # Get the r^2 statistic
            opts = deepcopy(opts_skeleton)
            opts['scoring_name'] = 'r2_score'
            r2_train = get_scoring_results(R2Scoring, opts, train)
            if test_split_ratio > 0:
                r2_test = get_scoring_results(R2Scoring, opts, test)

            # Get the RMSE statistic
            opts = deepcopy(opts_skeleton)
            opts['scoring_name'] = 'mean_squared_error'
            mse_train = get_scoring_results(MeanSquaredErrorScoring, opts, train)
            if test_split_ratio > 0:
                mse_test = get_scoring_results(MeanSquaredErrorScoring, opts, test)

            stats_rSquared.append(
                {'key': 'Training', 'value': round(float(r2_train['r2_score'][0]), ndigits_r2)}
            )
            # Take the square root to obtain RMSE
            stats_RMSE.append(
                {
                    'key': 'Training',
                    'value': round(sqrt(float(mse_train['mean_squared_error'][0])), ndigits),
                }
            )

            # Calculate and append the test scores only if there is test data
            if test_split_ratio > 0:
                stats_rSquared.append(
                    {
                        'key': 'Testing',
                        'value': round(float(r2_test['r2_score'][0]), ndigits_r2),
                    }
                )
                stats_RMSE.append(
                    {
                        'key': 'Testing',
                        'value': round(sqrt(float(mse_test['mean_squared_error'][0])), ndigits),
                    }
                )
        statistics_dict = {
            'stats_rSquared': stats_rSquared,
            'stats_RMSE': stats_RMSE,
            'stats_accuracy': stats_accuracy,
            'stats_f1': stats_f1,
            'stats_precision': stats_precision,
            'stats_recall': stats_recall,
        }
    except Exception as e:
        msg = 'Smart Prediction statistics could not be computed -- failed to evaluate scoring metrics on experiment metadata.'
        logger.debug(msg)
        logger.debug(e)
        statistics_dict = {}
    return statistics_dict
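
# Smart Prediction keys its lists by data split rather than by target, e.g.
# (values illustrative only):
#   {'stats_accuracy': [{'key': 'Training', 'value': 0.95},
#                       {'key': 'Testing', 'value': 0.91}], ...}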


def _merge_exp_metadata(exp_metadata_list):
    """Merge the list of experiment metadata into a single entry.

    Args:
        exp_metadata_list (list): list of experiment metadata dicts
            (each metadata dict is the output of get_statistics_metadata())

    Returns:
        exp_metadata (dict): Single experiment metadata with merged 'data' field
    """
    data_fieldname = 'data'
    data_list = [exp_metadata.get(data_fieldname) for exp_metadata in exp_metadata_list]
    # Fix the index after merging the data field
    data_df = pd.concat(data_list)
    data_df.index = list(range(len(data_df)))
    # All metadata entries should have the same fields and values except for the data
    exp_metadata = exp_metadata_list[-1]
    exp_metadata[data_fieldname] = data_df
    return exp_metadata
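
# Sketch of the merge, assuming two hypothetical chunks of the same experiment:
#   merged = _merge_exp_metadata([chunk_1_metadata, chunk_2_metadata])
#   merged['data']  # chunk dataframes concatenated with a fresh 0..n-1 index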


def compute_statistics(exp_metadata_list):
    """Compute the statistics for the given prediction problem.

    Accepted prediction problems are:
        - Predict Categorical Fields (PCF)
        - Predict Numeric Fields (PNF)
        - Smart Forecast
        - Smart Outlier Detection
        - Smart Clustering
        - Smart Prediction

    Args:
        exp_metadata_list (list): list of experiment metadata dicts
            (each metadata dict is the output of get_statistics_metadata())

    Returns:
        statistics (dict): dictionary of statistics results.
            An empty dictionary is returned if results could not be calculated.
    """
    # If any chunk is None, don't compute statistics
    if not all(exp_metadata_list):
        return {}
    exp_metadata = _merge_exp_metadata(exp_metadata_list)
    exp_type = exp_metadata.get('type')
    if exp_type == 'predict_categorical_fields':
        statistics_dict = compute_pcf_statistics(exp_metadata)
    elif exp_type == 'predict_numeric_fields':
        statistics_dict = compute_pnf_statistics(exp_metadata)
    elif exp_type == 'cluster_numeric_events':
        # cluster_numeric_events is not implemented yet
        statistics_dict = {}
    elif exp_type == 'smart_forecast':
        # Note: get_statistics_metadata extracts the forecast-metadata column
        # for the type string 'smart_forecasting', not 'smart_forecast'
        statistics_dict = compute_sf_statistics(exp_metadata)
    elif exp_type == 'smart_outlier_detection':
        statistics_dict = compute_soda_statistics(exp_metadata)
    elif exp_type == 'smart_clustering':
        statistics_dict = compute_sc_statistics(exp_metadata)
    elif exp_type == 'smart_prediction':
        statistics_dict = compute_sp_statistics(exp_metadata)
    else:
        logger.debug(
            "Cannot compute experiment statistics on experiment of type: {}.".format(exp_type)
        )
        statistics_dict = {}
    return statistics_dict
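
# Illustrative end-to-end flow; `experiment_json` and `csv_chunks` are
# hypothetical inputs, not names defined in this module:
#
#   metadata_chunks = [get_statistics_metadata(experiment_json, chunk) for chunk in csv_chunks]
#   stats = compute_statistics(metadata_chunks)  # {} if any chunk failed to load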