You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
649 lines
24 KiB
649 lines
24 KiB
from copy import deepcopy
|
|
from io import StringIO
|
|
|
|
from numpy import sqrt
|
|
import pandas as pd
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
import cexc
|
|
from algos.AutoPrediction import AutoPrediction
|
|
from algos_support.density_function.column_name import make_column_name
|
|
from scorings.classification.Accuracy import AccuracyScoring
|
|
from scorings.classification.PrecisionRecallFscoreSupport import (
|
|
PrecisionRecallFscoreSupportScoring,
|
|
)
|
|
|
|
from scorings.regression.R2 import R2Scoring
|
|
from scorings.regression.MeanSquaredError import MeanSquaredErrorScoring
|
|
from scorings.clustering.Silhouette import SilhouetteScoring
|
|
|
|
# Module-wide loggers obtained from the cexc helper: `logger` is used by the
# functions in this file for diagnostic output, `messages` is the cexc
# messages logger.
logger = cexc.get_logger('LogexperimentStatistics')
messages = cexc.get_messages_logger()

# Name of the extra CSV column that is extracted for forecast experiments and
# that compute_sf_statistics() filters on.
forecastMetadata = '_forecastMetadata'
|
|
|
|
|
|
def get_statistics_metadata(experiment, body):
    """Load experiment metadata and input data.

    Args:
        experiment (dict): fetched experiment from REST
        body (csv str): csv string data payload from CEXC

    Returns:
        exp_metadata (dict or None): experiment metadata, if it was accessible:
            - 'type': experiment type (str)
            - 'target_variables': ground-truth field names (list of str)
            - 'predicted': predicted field names (list of str)
            - 'data': the target/predicted columns read from `body` (pd.DataFrame)
            - 'algorithmParams': value of 'algorithmParams' taken from a search stage
        If the experiment is malformed or `body` lacks the required columns,
        the reason is logged and None is returned.
    """
    debugging_msg_prefix = 'Statistics could not be computed -- {}.'

    exp_type = experiment.get('type')
    # Get 'main' search stage, as it contains the desired target/feature variables.
    # `not search_stages` also covers a 'searchStages' key that is present but None.
    search_stages = experiment.get('searchStages', [])
    if not search_stages:
        logger.error(debugging_msg_prefix.format('experiment has no searchStages'))
        return None

    # We assume that each 'stage' is a python dictionary (search stages is a list of dictionaries)
    main_search_stages = [stage for stage in search_stages if stage.get('role') == 'main']
    if len(main_search_stages) != 1:
        logger.error(debugging_msg_prefix.format("require exactly one 'main' search stages."))
        return None

    # Get the target variables and predicted fields
    main_search_stage = main_search_stages[0]
    target_variables = main_search_stage.get('targetVariables')

    if target_variables is None:
        # check for univariate 'targetVariable' field
        target_variable = main_search_stage.get('targetVariable')
        if target_variable is None:
            logger.error(
                debugging_msg_prefix.format(
                    'No target variable(s) was found in main search stage.'
                )
            )
            return None
        target_variables = [target_variable]
    elif not (
        isinstance(target_variables, list)
        and len(target_variables) > 0
        and all(isinstance(t, str) for t in target_variables)
    ):
        # we need a non-empty list made only of strings (target column names)
        logger.error(
            debugging_msg_prefix.format(
                'Malformatted "targetVariables" field: {}'.format(target_variables)
            )
        )
        return None

    if exp_type == 'smart_outlier_detection':
        prefix = "IsOutlier"
        predicted_fields = [
            make_column_name(main_name=prefix, feature_variable=t) for t in target_variables
        ]
    elif exp_type == 'smart_clustering':
        predicted_fields = ['cluster']
    else:
        predicted_fields = ['predicted({})'.format(t) for t in target_variables]

    extract_fields_list = target_variables + predicted_fields
    # compute_statistics() routes type 'smart_forecast' to compute_sf_statistics(),
    # which filters on the forecast metadata column, so that column must be
    # extracted for that type. 'smart_forecasting' is still accepted because it
    # is the spelling this function checked for previously.
    if exp_type in ('smart_forecast', 'smart_forecasting'):
        extract_fields_list = extract_fields_list + [forecastMetadata]
    # pandas require unicode column names
    cols_to_extract = [str(f) for f in extract_fields_list]

    sio = StringIO(body)
    try:  # Ensure that the data contains the target variables and predicted fields
        applied_data = pd.read_csv(sio, usecols=cols_to_extract)
    except ValueError as e:
        msg = "data must contain the columns: {}"
        logger.debug(msg.format(cols_to_extract))
        logger.debug(e)
        return None

    # NOTE(review): algorithmParams is read from the second search stage when
    # more than one stage exists, otherwise from the first -- not necessarily
    # the 'main' stage. Confirm this is intended.
    params_stage = search_stages[1] if len(search_stages) > 1 else search_stages[0]
    algorithm_params = params_stage.get('algorithmParams')

    # Return the metadata required for scoring
    exp_metadata = {
        'type': exp_type,
        'target_variables': target_variables,
        'predicted': predicted_fields,
        'data': applied_data,
        'algorithmParams': algorithm_params,
    }
    return exp_metadata
|
|
|
|
|
|
def get_scoring_results(scoring_class, scoring_opts, data):
    """Instantiate a scoring class and run it on the given data.

    The same options object is handed both to the constructor and to the
    `score` call.

    Args:
        scoring_class (scoring obj): class whose instances can compute a score
        scoring_opts (dict): options passed to the scoring object
        data (pd.dataframe): data containing predicted and ground-truth data

    Returns:
        results (pd.dataframe): whatever `score(data, scoring_opts)` returns
    """
    return scoring_class(scoring_opts).score(data, scoring_opts)
|
|
|
|
|
|
def compute_pcf_statistics(exp_metadata, ndigits=2):
    """Compute precision, recall, f1 and accuracy for a PCF experiment.

    Precision, recall and f1 are requested with 'weighted' averaging.

    Args:
        exp_metadata (dict): Metadata of an experiment, containing:
            - experiment type (str)
            - ground-truth field names (list)
            - predicted field names (list)
            - data containing ground-truth/predicted data (pd.dataframe)
        ndigits (int): Number of digits to keep after the decimal place

    Returns:
        statistics_dict (dict): Dictionary of computed statistics; empty when
            any scoring step raised (the error is logged at debug level).
    """
    base_opts = {
        'scoring_name': '',
        'params': {},
        'variables': [],
        'a_variables': exp_metadata['target_variables'],
        'b_variables': exp_metadata['predicted'],
    }

    try:
        # Precision / recall / f1 share a single scoring call.
        prfs_opts = deepcopy(base_opts)
        prfs_opts['scoring_name'] = 'precision_recall_fscore_support'
        prfs_opts['params']['average'] = 'weighted'
        prfs = get_scoring_results(
            PrecisionRecallFscoreSupportScoring, prfs_opts, exp_metadata['data']
        )

        # Accuracy uses a fresh copy so the 'average' param does not leak in.
        accuracy_opts = deepcopy(base_opts)
        accuracy_opts['scoring_name'] = 'accuracy_score'
        accuracy = get_scoring_results(AccuracyScoring, accuracy_opts, exp_metadata['data'])

        def first_rounded(result, column):
            # Each score is read from position 0 of its result column.
            return round(float(result[column][0]), ndigits)

        # Keys must be compatible with the experiment schema.
        return {
            'stats_precision': first_rounded(prfs, 'precision'),
            'stats_recall': first_rounded(prfs, 'recall'),
            'stats_f1': first_rounded(prfs, 'fbeta_score'),
            'stats_accuracy': first_rounded(accuracy, 'accuracy_score'),
        }
    except Exception as err:
        logger.debug(
            'PCF statistics could not be computed -- failed to evaluate scoring metrics on experiment metadata.'
        )
        logger.debug(err)
        return {}
|
|
|
|
|
|
def compute_pnf_statistics(exp_metadata, ndigits_rmse=2, ndigits_r2=4):
    """Compute r^2 and RMSE for a PNF experiment.

    RMSE is obtained as the square root of the mean-squared-error score.

    Args:
        exp_metadata (dict): Metadata of an experiment, containing:
            - experiment type (str)
            - ground-truth field names (list)
            - predicted field names (list)
            - data containing ground-truth/predicted data (pd.dataframe)
        ndigits_rmse (int): Number of digits to keep after the decimal place
            for root-mean-squared-error metric
        ndigits_r2 (int): Number of digits to keep after the decimal place
            for r^2 metric

    Returns:
        statistics_dict (dict): Dictionary of computed statistics; empty when
            any scoring step raised (the error is logged at debug level).
    """
    base_opts = {
        'scoring_name': '',
        'params': {},
        'a_variables': exp_metadata['target_variables'],
        'b_variables': exp_metadata['predicted'],
        'variables': [],
    }

    try:
        # r^2 (coefficient of determination)
        r2_opts = deepcopy(base_opts)
        r2_opts['scoring_name'] = 'r2_score'
        r2_result = get_scoring_results(R2Scoring, r2_opts, exp_metadata['data'])

        # mean squared error, converted to RMSE below
        mse_opts = deepcopy(base_opts)
        mse_opts['scoring_name'] = 'mean_squared_error'
        mse_result = get_scoring_results(MeanSquaredErrorScoring, mse_opts, exp_metadata['data'])

        r_squared = float(r2_result['r2_score'][0])
        rmse = sqrt(float(mse_result['mean_squared_error'][0]))

        # Keys must be compatible with the experiment schema.
        # For r^2, the default of 4 decimal places is kept for historical reasons.
        return {
            'stats_rSquared': round(r_squared, ndigits_r2),
            'stats_RMSE': round(rmse, ndigits_rmse),
        }
    except Exception as err:
        logger.debug(
            'PNF statistics could not be computed -- failed to evaluate scoring metrics on experiment metadata.'
        )
        logger.debug(err)
        return {}
|
|
|
|
|
|
def compute_sf_statistics(exp_metadata, ndigits_rmse=2, ndigits_r2=4):
    """Compute the statistics for a Smart Forecast experiment.

    - Computes r^2 (coefficient of determination) and
      RMSE (root mean squared error), one value per target variable.
    - Only the rows whose forecast metadata column equals "hf" are scored.

    The input `exp_metadata` is not modified.

    Args:
        exp_metadata (dict): Metadata of an experiment, containing:
            - experiment type (str)
            - ground-truth field names (list)
            - predicted field names (list)
            - data containing ground-truth/predicted data and the forecast
              metadata column (pd.dataframe)
        ndigits_rmse (int): Number of digits to keep after the decimal place
            for root-mean-squared-error metric
        ndigits_r2 (int): Number of digits to keep after the decimal place
            for r^2 metric

    Returns:
        statistics_dict (dict): 'stats_rSquared' and 'stats_RMSE', each a list
            of {'key': <target variable>, 'value': <rounded score>} dicts.
            Empty dict when anything failed (the error is logged at debug level).
    """
    opts_skeleton = {
        'scoring_name': '',
        'params': {},
        'a_variables': exp_metadata['target_variables'],
        'b_variables': exp_metadata['predicted'],
        'variables': [],
    }

    try:
        # Keep the filtered frame local: writing it back into exp_metadata
        # would silently shrink the caller's data.
        dataset = exp_metadata['data']
        scored_rows = dataset[dataset[forecastMetadata] == "hf"]

        # Get the r^2 statistic
        opts = deepcopy(opts_skeleton)
        opts['scoring_name'] = 'r2_score'
        r2 = get_scoring_results(R2Scoring, opts, scored_rows)

        # Get the RMSE statistic
        opts = deepcopy(opts_skeleton)
        opts['scoring_name'] = 'mean_squared_error'
        mse = get_scoring_results(MeanSquaredErrorScoring, opts, scored_rows)

        stats_rSquared, stats_RMSE = [], []
        for idx, t in enumerate(exp_metadata['target_variables']):
            # For r^2, the default of 4 decimal places is kept for historical reasons
            stats_rSquared.append(
                {'key': t, 'value': round(float(r2['r2_score'][idx]), ndigits_r2)}
            )
            # Take the square root to obtain RMSE
            stats_RMSE.append(
                {
                    'key': t,
                    'value': round(sqrt(float(mse['mean_squared_error'][idx])), ndigits_rmse),
                }
            )

        # Create statistics dictionary; keys must be compatible with experiment schema
        statistics_dict = {'stats_rSquared': stats_rSquared, 'stats_RMSE': stats_RMSE}

    except Exception as e:
        msg = 'Smart Forecast statistics could not be computed -- failed to evaluate scoring metrics on experiment metadata.'
        logger.debug(msg)
        logger.debug(e)
        statistics_dict = {}

    return statistics_dict
|
|
|
|
|
|
def compute_soda_statistics(exp_metadata, outlier_value=1.0):
    """Compute the statistics for a Smart Outlier Detection experiment.

    - Counts, for every target variable, the rows whose "IsOutlier" column
      (named via make_column_name) equals `outlier_value`.

    Args:
        exp_metadata (dict): Metadata of an experiment, containing:
            - experiment type (str)
            - ground-truth field names (list)
            - predicted field names (list)
            - data containing ground-truth/predicted data (pd.dataframe)
        outlier_value (float): Value marking an entry as a predicted outlier

    Returns:
        statistics_dict (dict): {'stats_outlierCount': [{'key': <target
            variable>, 'value': <int count>}, ...]}. Empty dict when anything
            failed (the error is logged at debug level).
    """
    try:
        # The data is only read here, so no copy is needed.
        data = exp_metadata['data']
        stats_outlierCount = []
        # Count outliers for each target variable
        for target in exp_metadata['target_variables']:
            outlier_col = make_column_name(main_name="IsOutlier", feature_variable=target)
            outlier_count = int((data[outlier_col] == outlier_value).sum())
            stats_outlierCount.append({'key': target, 'value': outlier_count})

        # Create statistics dictionary; keys must be compatible with experiment schema
        statistics_dict = {'stats_outlierCount': stats_outlierCount}

    except Exception as e:
        msg = 'Smart Outlier Detection statistics could not be computed -- failed to evaluate scoring metrics on experiment metadata.'
        logger.debug(msg)
        logger.debug(e)
        statistics_dict = {}

    return statistics_dict
|
|
|
|
|
|
def compute_sc_statistics(exp_metadata, ndigits_silhouette=2):
    """Compute the statistics for a Smart Clustering experiment.

    - Computes Silhouette Score

    Args:
        exp_metadata (dict): Metadata of an experiment, containing:
            - experiment type (str)
            - predicted field name (list which includes only label field name)
            - target variables field names (list)
            - data containing target_variables/label data (pd.dataframe)
        ndigits_silhouette (int): Number of digits to keep after the decimal place for silhouette metric

    Returns:
        statistics_dict (dict): {'stats_silhouette_score': <rounded score>}.
            Empty dict when anything failed (the error is logged at debug level).
    """
    # Note the roles: the cluster label is passed as 'a_variables' and the
    # target variables as 'b_variables'.
    opts = {
        'scoring_name': 'silhouette_score',
        'params': {},
        'a_variables': exp_metadata['predicted'],
        'b_variables': exp_metadata['target_variables'],
        'variables': [],
    }
    try:
        # Get the silhouette score
        silhouette = get_scoring_results(SilhouetteScoring, opts, exp_metadata['data'])
        score = silhouette['silhouette_score']
        # Presumably the scorer returns a DataFrame, making `score` a
        # one-element Series -- TODO confirm. Calling float() directly on a
        # Series is deprecated in pandas and slated to raise TypeError, which
        # the handler below would turn into silently missing statistics.
        # Extract the element explicitly; plain scalars pass through unchanged.
        if hasattr(score, 'iloc'):
            score = score.iloc[0]
        # Create statistics dictionary; keys must be compatible with experiment schema
        statistics_dict = {
            'stats_silhouette_score': round(float(score), ndigits_silhouette)
        }
    except Exception as e:
        msg = 'Smart Clustering statistics could not be computed -- failed to evaluate scoring metrics on experiment metadata.'
        logger.debug(msg)
        logger.debug(e)
        statistics_dict = {}
    return statistics_dict
|
|
|
|
|
|
def compute_sp_statistics(exp_metadata, ndigits=2, ndigits_r2=4):
    """Compute the statistics for a Smart Prediction experiment.

    - Computes r^2 (coefficient of determination) and
      RMSE (root mean squared error) when the experiment is numerical
    - Computes accuracy, precision, f1_score, recall when the experiment is categorical.

    Scores are always reported for a 'Training' split; a 'Testing' split is
    added when the algorithm parameter 'test_split_ratio' is greater than 0.

    Args:
        exp_metadata (dict): Metadata of an experiment, containing:
            - experiment type (str)
            - ground-truth field names (list)
            - predicted field names (list)
            - data containing ground-truth/predicted data (pd.dataframe)
            - algorithmParams (dict): read for 'test_split_ratio' and passed
              to AutoPrediction.is_categorical
        ndigits (int): Number of digits to keep after the decimal place
            for any scoring metric
        ndigits_r2 (int): Number of digits to keep after the decimal place
            for r^2 metric

    Returns:
        statistics_dict (dict): 'stats_rSquared', 'stats_RMSE',
            'stats_accuracy', 'stats_f1', 'stats_precision', 'stats_recall',
            each a list of {'key': 'Training'|'Testing', 'value': <score>};
            lists for the metric family that does not apply stay empty.
            Empty dict when anything failed (the error is logged at debug level).
    """
    opts_skeleton = {
        'scoring_name': '',
        'params': {},
        'a_variables': exp_metadata.get('target_variables'),
        'b_variables': exp_metadata.get('predicted'),
        'variables': [],
    }

    try:
        # Parameter handling lives inside the try so that missing or malformed
        # algorithm parameters yield {} like every other failure, instead of
        # escaping as an exception.
        data = exp_metadata.get('data')
        algo_params = exp_metadata.get('algorithmParams') or {}
        test_split_ratio = algo_params.get('test_split_ratio', 0)

        splits = [('Training', data)]
        if test_split_ratio > 0:
            # NOTE(review): this is a fresh, unseeded random split of the scored
            # data, so results vary between calls and the rows need not match
            # the split used when the model was fitted -- confirm intended.
            train, test = train_test_split(data, test_size=test_split_ratio)
            splits = [('Training', train), ('Testing', test)]

        # there is only one target variable in Smart Prediction. send it as a field to is_categorical
        categorical = AutoPrediction.is_categorical(
            data, exp_metadata.get('target_variables')[0], algo_params
        )

        stats_rSquared, stats_RMSE = [], []
        stats_accuracy, stats_precision, stats_recall, stats_f1 = [], [], [], []

        # there is only one target_variable in Smart Prediction so the first
        # value of every score column is used
        if categorical:
            accuracy_opts = _sp_scoring_opts(opts_skeleton, 'accuracy_score')
            prfs_opts = _sp_scoring_opts(
                opts_skeleton, 'precision_recall_fscore_support', {'average': 'weighted'}
            )
            for split_name, frame in splits:
                accuracy = get_scoring_results(AccuracyScoring, accuracy_opts, frame)
                prfs = get_scoring_results(
                    PrecisionRecallFscoreSupportScoring, prfs_opts, frame
                )
                stats_accuracy.append(
                    _sp_stat_entry(split_name, float(accuracy['accuracy_score'][0]), ndigits)
                )
                stats_f1.append(
                    _sp_stat_entry(split_name, float(prfs['fbeta_score'][0]), ndigits)
                )
                stats_precision.append(
                    _sp_stat_entry(split_name, float(prfs['precision'][0]), ndigits)
                )
                stats_recall.append(
                    _sp_stat_entry(split_name, float(prfs['recall'][0]), ndigits)
                )
        else:
            r2_opts = _sp_scoring_opts(opts_skeleton, 'r2_score')
            mse_opts = _sp_scoring_opts(opts_skeleton, 'mean_squared_error')
            for split_name, frame in splits:
                r2 = get_scoring_results(R2Scoring, r2_opts, frame)
                mse = get_scoring_results(MeanSquaredErrorScoring, mse_opts, frame)
                stats_rSquared.append(
                    _sp_stat_entry(split_name, float(r2['r2_score'][0]), ndigits_r2)
                )
                # Take the square root to obtain RMSE
                stats_RMSE.append(
                    _sp_stat_entry(
                        split_name, sqrt(float(mse['mean_squared_error'][0])), ndigits
                    )
                )

        # Keys must be compatible with the experiment schema.
        statistics_dict = {
            'stats_rSquared': stats_rSquared,
            'stats_RMSE': stats_RMSE,
            'stats_accuracy': stats_accuracy,
            'stats_f1': stats_f1,
            'stats_precision': stats_precision,
            'stats_recall': stats_recall,
        }
    except Exception as e:
        msg = 'Smart Prediction statistics could not be computed -- failed to evaluate scoring metrics on experiment metadata.'
        logger.debug(msg)
        logger.debug(e)
        statistics_dict = {}

    return statistics_dict


def _sp_scoring_opts(opts_skeleton, scoring_name, params=None):
    """Return an independent copy of opts_skeleton configured for one scoring method."""
    opts = deepcopy(opts_skeleton)
    opts['scoring_name'] = scoring_name
    if params is not None:
        opts['params'] = params
    return opts


def _sp_stat_entry(split_name, value, ndigits):
    """Build one {'key': split name, 'value': rounded score} statistics entry."""
    return {'key': split_name, 'value': round(value, ndigits)}
|
|
|
|
|
|
def _merge_exp_metadata(exp_metadata_list):
|
|
"""
|
|
Merge the list of experiment metadata into a single entry
|
|
|
|
Args:
|
|
exp_metadata_list (list): list of experiment metadata dicts
|
|
(each metadata dict is the output of get_statistics_metadata())
|
|
|
|
Returns:
|
|
exp_metadata (dict): Single experiment metadata with merged 'data' field
|
|
"""
|
|
data_fieldname = 'data'
|
|
data_list = [exp_metadata.get(data_fieldname) for exp_metadata in exp_metadata_list]
|
|
|
|
# Fix the index after merging the data field
|
|
data_df = pd.concat(data_list)
|
|
data_df.index = list(range(len(data_df)))
|
|
|
|
# All metadata entries should have the same fields and values except for the data.
|
|
exp_metadata = exp_metadata_list[-1]
|
|
exp_metadata[data_fieldname] = data_df
|
|
return exp_metadata
|
|
|
|
|
|
def compute_statistics(exp_metadata_list):
    """Compute the statistics for the given experiment.

    The chunks are merged into one metadata entry and dispatched on its
    'type' field:
        - 'predict_categorical_fields' -> compute_pcf_statistics
        - 'predict_numeric_fields'     -> compute_pnf_statistics
        - 'smart_forecast'             -> compute_sf_statistics
        - 'smart_outlier_detection'    -> compute_soda_statistics
        - 'smart_clustering'           -> compute_sc_statistics
        - 'smart_prediction'           -> compute_sp_statistics
        - 'cluster_numeric_events'     -> not implemented, returns {}

    Args:
        exp_metadata_list (list): list of experiment metadata dicts
            (each metadata dict is the output of get_statistics_metadata())

    Returns:
        statistics (dict): dictionary of statistics results.
            Empty dictionary returned if results could not be calculated:
            no chunks, a missing (None) chunk, or an unsupported type.
    """
    # With no chunks there is nothing to merge (all([]) is True, so the empty
    # list needs its own check). If any chunk is None, don't compute statistics.
    if not exp_metadata_list or not all(exp_metadata_list):
        return {}

    exp_metadata = _merge_exp_metadata(exp_metadata_list)

    exp_type = exp_metadata.get('type')
    if exp_type == 'predict_categorical_fields':
        statistics_dict = compute_pcf_statistics(exp_metadata)
    elif exp_type == 'predict_numeric_fields':
        statistics_dict = compute_pnf_statistics(exp_metadata)
    elif exp_type == 'cluster_numeric_events':
        # cluster_numeric_events are not implemented yet
        statistics_dict = {}
    elif exp_type == 'smart_forecast':
        statistics_dict = compute_sf_statistics(exp_metadata)
    elif exp_type == 'smart_outlier_detection':
        statistics_dict = compute_soda_statistics(exp_metadata)
    elif exp_type == 'smart_clustering':
        statistics_dict = compute_sc_statistics(exp_metadata)
    elif exp_type == 'smart_prediction':
        statistics_dict = compute_sp_statistics(exp_metadata)
    else:
        logger.debug(
            "Cannot compute experiment statistics on experiment of type: {}.".format(exp_type)
        )
        statistics_dict = {}
    return statistics_dict
|