You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

766 lines
33 KiB

#!/usr/bin/env python
import json
from collections import defaultdict
from collections import OrderedDict
from itertools import chain
from functools import partial
import numpy as np
import pandas as pd
from vendor.toolz import itertoolz
from algos_support.density_function.column_name import make_column_name
from algos_support.density_function.distance_metric import DistanceMetric
from algos_support.density_function.distribution_factory import get_distribution
from algos_support.density_function.outlier_threshold import OutlierThreshold
from algos_support.density_function.probability_distribution import (
ApplyParams,
DistributionType,
DistributionName,
ProbabilityDistribution,
)
from base import BaseAlgo
from codec import codecs_manager
from codec.codecs import SimpleObjectCodec
from util import df_util
from util.multivalue_util import multivalue_encode
from util.numpy_util import safe_str_infinity
from util.param_util import convert_params
import cexc
logger = cexc.get_logger(__name__)
messages = cexc.get_messages_logger()
class LabeledDistribution(object):
    """Serializable pairing of a fitted distribution and its group labels.

    Bundles one ProbabilityDistribution instance together with the mapping
    of by-field names to the field values that identify the data group the
    distribution was fitted on.
    """

    def __init__(self, dist, fields):
        """
        Args:
            dist (ProbabilityDistribution): A probability distribution.
            fields (dict): Mapping of field names to field values.
        """
        self.dist = dist
        self.fields = fields
class DensityFunction(BaseAlgo):
OUTPUT_NAME = 'IsOutlier'
SAMPLED_VALUE = 'SampledValue'
BOUNDARY_RANGES = 'BoundaryRanges'
OPTIONS = '_Options'
def __init__(self, options):
    """Parse and validate search options and initialize algorithm state.

    Args:
        options (dict): Search options; may contain 'params',
            'mlspl_limits', and 'split_by' entries.

    Raises:
        RuntimeError: On any invalid parameter value or combination.
    """
    DensityFunction._handle_options(options)
    self._params = convert_params(
        options.get('params', {}),
        strs=['dist', 'metric', 'show_options', "exclude_dist"],
        bools=['show_density', 'full_sample', 'sample'],
        multiple_floats=['threshold', 'lower_threshold', 'upper_threshold'],
        ints=['random_state'],
    )
    acceptable_dists = (
        DistributionType.AUTO,
        DistributionType.NORMAL,
        DistributionType.EXPONENTIAL,
        DistributionType.GAUSSIAN_KDE,
        DistributionType.BETA,
    )
    # 'auto' is not excludable: exclude_dist narrows auto-selection only.
    acceptable_exclude_dist = (
        DistributionType.NORMAL,
        DistributionType.EXPONENTIAL,
        DistributionType.GAUSSIAN_KDE,
        DistributionType.BETA,
    )
    self._dist_type = self._params.pop('dist', DistributionType.AUTO)
    if self._dist_type not in acceptable_dists:
        msg = 'Invalid value error: dist must be one of {}, but found dist="{}".'
        dists = ', '.join(['\"{}\"'.format(x) for x in acceptable_dists])
        raise RuntimeError(msg.format(dists, self._dist_type))
    self._exclude_dist = None
    exclude_string = self._params.pop('exclude_dist', None)
    if self._dist_type == DistributionType.AUTO and exclude_string:
        # exclude_dist is a comma-separated list; validate each entry.
        self._exclude_dist = exclude_string.split(",")
        excludes = ', '.join(['\"{}\"'.format(x) for x in acceptable_exclude_dist])
        for i in range(len(self._exclude_dist)):
            self._exclude_dist[i] = self._exclude_dist[i].strip()
            if self._exclude_dist[i] not in acceptable_exclude_dist:
                msg = 'Invalid value error: exclude_dist must be one or more of {}, but found "{}" in exclude_dist.'
                raise RuntimeError(msg.format(excludes, self._exclude_dist[i]))
        # Excluding every candidate would leave nothing for auto to pick.
        if sorted(self._exclude_dist) == sorted(acceptable_exclude_dist):
            raise RuntimeError(
                "You cannot exclude all of the distribution types when using the exclude_dist parameter. Update your SPL search parameters to include at least one distribution type and run the search again."
            )
    elif exclude_string:
        raise RuntimeError(
            'The exclude_dist parameter can only be used when dist=auto. Update your SPL search parameters and run the search again.'
        )
    self._metric = self._params.get('metric', DistanceMetric.WASSERSTEIN)
    acceptable_metrics = [DistanceMetric.KOLMOGOROV_SMIRNOV, DistanceMetric.WASSERSTEIN]
    if self._metric not in acceptable_metrics:
        msg = 'Invalid value error: metric must be one of {}, but found metric="{}".'
        metrics = ', '.join(['\"{}\"'.format(x) for x in acceptable_metrics])
        raise RuntimeError(msg.format(metrics, self._metric))
    self._distance = None
    # self._dist holds a single ProbabilityDistribution when there is no
    # by-clause, otherwise a dict mapping group keys to LabeledDistribution.
    self._dist = None
    mlspl_limits = options.get('mlspl_limits', {})
    # threshold is a tuple of floats even if there is only one value
    self._threshold = OutlierThreshold(
        threshold=self._params.get('threshold'),
        lower=self._params.get('lower_threshold'),
        upper=self._params.get('upper_threshold'),
        default_threshold=(float(mlspl_limits.get('default_prob_threshold', 0.01)),),
    )
    max_threshold_num = mlspl_limits.get('max_threshold_num', 5)
    try:
        max_threshold_num = int(max_threshold_num)
    except (TypeError, ValueError):
        # Narrow exception types: int() can only raise these for bad input.
        raise RuntimeError(
            '"max_threshold_num" must be an integer. Found "max_threshold_num"={}.'.format(
                max_threshold_num
            )
        )
    if max_threshold_num < 0:
        msg = '"max_threshold_num" can not be a negative number. Found "max_threshold_num"={}.'
        raise RuntimeError(msg.format(max_threshold_num))
    self._check_threshold(self._threshold, max_threshold_num)
    self.split_by = options.get('split_by')
    show_options = self._params.get('show_options', None)
    self._show_options_values = None
    if show_options:
        self._show_options_values = DensityFunction._get_show_options_value(
            show_options, options
        )
    # One-shot warning flags so each condition is reported at most once.
    # Flag that is set to true when during `fit` data there are
    # too few training points for one or more of the groups
    self._warned_on_few_training_data = False
    # Flag that is set to true when during `apply` we encounter
    # a group that the model does not have a distribution for.
    self._warned_on_missing_group = False
    # Flag that is set to true when the distribution type is
    # Exponential and one of the given thresholds is lower_threshold
    self._warned_on_expon_lower_threshold = False
    # Flag that is set to true when the distribution type is
    # Beta and one of the given thresholds is upper_threshold
    self._warned_on_beta_upper_threshold = False
@staticmethod
def _get_show_options_value(show_options, options):
dict_show_options = OrderedDict()
show_options = show_options.replace(" ", "")
absent = []
for k in show_options.split(","):
if k not in options.keys():
absent.append(k)
else:
dict_show_options[k] = options[k]
return json.dumps(dict_show_options)
@staticmethod
def _handle_options(options):
    """Validate search-level options before parameter parsing.

    Args:
        options (dict): Search options as provided by the platform.

    Raises:
        RuntimeError: If the field list is malformed, the by-clause has
            too many fields, or a by-field name collides with a column
            name produced by summary().
    """
    # The most common cases this if condition occurs are: the user forgets to put quotation marks at the start and end of by fields or thresholds.
    # In that case the search command understands it as there are multiple feature_variables.
    if len(options.get('feature_variables', [])) != 1 or 'target_variable' in options:
        raise RuntimeError(
            'Syntax error: If multiple values of by field or any parameter are used with comma (",") they should be given in quotation marks e.g. by "DayOfWeek,HourOfDay" or threshold="0.01,0.02".'
        )
    mlspl_limits = options.get('mlspl_limits', {})
    max_fields_in_by_clause = int(mlspl_limits.get('max_fields_in_by_clause', 5))
    if len(options.get('split_by', [])) > max_fields_in_by_clause:
        raise RuntimeError(
            'The number of fields in the by clause cannot exceed {}'.format(
                max_fields_in_by_clause
            )
        )
    if 'model_name' in options:
        # When saving a model, by-field names must not clash with the
        # column names that summary() emits, or the summary would be ambiguous.
        summary_fnames = [
            'type',
            'min',
            'max',
            'mean',
            'std',
            'cardinality',
            'distance',
            'other',
        ]
        by_fields = options.get('split_by', [])
        for fname in by_fields:
            if fname in summary_fnames:
                raise RuntimeError(
                    f'The field "{fname}" conflicts with summary field names "{", ".join(summary_fnames)}". '
                    f'Please rename "{fname}".'
                )
@staticmethod
def _generate_key(field_value_tuple):
"""Generate a string from the tuple of field values,
where a tuple specifies a data group defined by the values.
"""
return '_'.join(map(str, field_value_tuple))
def _warn_on_few_training_data(self):
    """Emit a user-facing warning if fit saw too little training data.

    No-op unless the _warned_on_few_training_data flag was raised while
    fitting; the wording differs depending on whether a by-clause is used.
    """
    if not self._warned_on_few_training_data:
        return
    if self.split_by:
        msg = (
            'Too few training points in some groups will likely result in poor '
            'accuracy for those groups. Please see model summary to inspect such groups.'
        )
    else:
        msg = 'Too few training points in data set will likely result in poor accuracy. Please see model summary.'
    messages.warn(msg)
def _warn_on_dist_name_mismatches_threshold(self, dist_name, threshold):
    """Warn (once) when a lower threshold is paired with an Exponential fit.

    The Exponential distribution only has a right-tail outlier region, so a
    lower_threshold value has no effect and the user is told it is ignored.
    """
    if not threshold.is_lower_upper:
        return
    if dist_name != DistributionName.EXPONENTIAL:
        return
    if threshold.lower is None or self._warned_on_expon_lower_threshold:
        return
    messages.warn(
        'Exponential Distribution type can have an outlier region only at the right tail. Ignoring "lower_threshold" value for Exponential distribution.'
    )
    self._warned_on_expon_lower_threshold = True
def _fit_data(self, distribution, data, mlspl_limits):
    """Fit one ProbabilityDistribution on a data series.

    Warns once (across all groups) when the series has fewer points than
    the configured 'min_data_size_to_fit' limit, then records the fit
    distance on the algorithm instance.
    """
    min_data_size = int(mlspl_limits.get('min_data_size_to_fit', 50))
    if len(data) < min_data_size and not self._warned_on_few_training_data:
        self._warned_on_few_training_data = True
        self._warn_on_few_training_data()
    # Backward compatibility: models created by MLTK <= 5.3.0 predate the
    # _exclude_dist attribute, so default it to None when absent.
    self._exclude_dist = getattr(self, '_exclude_dist', None)
    distribution.fit(data.values, self._metric, self._exclude_dist)
    self._warn_on_dist_name_mismatches_threshold(distribution.get_name(), self._threshold)
    self._distance = distribution.distance
@staticmethod
def _get_max_groups(mlspl_limits):
max_groups = mlspl_limits.get('max_groups', 1024)
try:
return int(max_groups)
except:
raise RuntimeError(
'Invalid value for max_groups={}: max_groups (in Settings or mlspl.conf file) must be an int'.format(
max_groups
)
)
def _fit_groups(self, X, mlspl_limits):
    """Split the data on the by-fields and fit one distribution per group."""
    max_groups = DensityFunction._get_max_groups(mlspl_limits)
    feature = self.feature_variables[0]
    self._dist = {}
    for label, group_df in df_util.split_by(X, self.split_by, max_groups=max_groups):
        dist = get_distribution(self._dist_type, mlspl_limits)
        self._fit_data(dist, group_df[feature], mlspl_limits)
        # groupby yields a scalar label when there is a single by-field;
        # normalize to a 1-tuple so downstream handling is uniform.
        if not isinstance(label, tuple):
            label = (label,)
        self._dist[DensityFunction._generate_key(label)] = LabeledDistribution(
            dist, dict(zip(self.split_by, label))
        )
def _update_data(self, distribution, data):
    """Incrementally update a fitted distribution with a new data series,
    then refresh the recorded fit distance."""
    distribution.update(data.values)
    self._distance = distribution.distance
def _update_groups(self, X, mlspl_limits):
    """Split data on the by-fields; update known groups, fit new ones."""
    max_groups = DensityFunction._get_max_groups(mlspl_limits)
    feature = self.feature_variables[0]
    for label, group_df in df_util.split_by(X, self.split_by, max_groups=max_groups):
        # Normalize scalar labels (single by-field) to 1-tuples.
        if not isinstance(label, tuple):
            label = (label,)
        label_key = DensityFunction._generate_key(label)
        known = self._dist.get(label_key)
        if known is not None:
            self._update_data(known.dist, group_df[feature])
        else:
            # A group not seen in earlier chunks: fit it from scratch.
            dist = get_distribution(self._dist_type, mlspl_limits)
            self._fit_data(dist, group_df[feature], mlspl_limits)
            self._dist[label_key] = LabeledDistribution(dist, dict(zip(self.split_by, label)))
            # The model can only grow here, so enforce the group cap
            # accumulated across partial_fit chunks.
            if len(self._dist) > max_groups:
                raise RuntimeError(
                    "The number of groups exceeds the max ({}) specified in mlspl.conf file".format(
                        max_groups
                    )
                )
def _check_threshold(self, threshold, max_num_threshold):
    """Verify the specified threshold is acceptable.

    Raises:
        RuntimeError: When more thresholds were given than allowed.
    """
    assert self._threshold.is_specified()
    if not threshold.is_multiple():
        return
    size_th = threshold.get_size()
    if size_th > max_num_threshold:
        raise RuntimeError(
            'The maximum number of allowed thresholds are {}. Found {} thresholds.'.format(
                max_num_threshold, size_th
            )
        )
def _check_target_field_is_numeric(self, X):
    """Raise if the feature column of X does not have a numeric dtype."""
    feature = self.feature_variables[0]
    if not np.issubdtype(X[feature].dtype, np.number):
        raise RuntimeError(
            'Feature \"{}\" is not a numeric type'.format(feature)
        )
def _set_random_state(self):
    """Seed numpy's global RNG when a random_state param was supplied."""
    random_state = self._params.get('random_state')
    if random_state is not None:
        # Lazy %-style args: the message is only formatted if DEBUG is on.
        logger.debug('Setting random state to %s', random_state)
        np.random.seed(random_state)
def _prepare_fit(self, df, options):
    """Shared preprocessing for fit/partial_fit.

    Seeds the RNG, prepares the feature (and by-) columns, and validates
    that the target field is present and numeric.

    Returns:
        tuple: (prepared DataFrame, nans mask, mlspl_limits dict)
    """
    self._set_random_state()
    mlspl_limits = options.get('mlspl_limits', {})
    X, nans, _ = df_util.prepare_features(
        X=df.copy(),
        variables=self.feature_variables + (self.split_by or []),
        mlspl_limits=options.get('mlspl_limits'),
        get_dummies=False,
    )
    df_util.assert_field_present(df, self.feature_variables[0])
    self._check_target_field_is_numeric(X)
    return X, nans, mlspl_limits
def fit(self, df, options):
    """Fit the distribution(s) on df and return df with output columns merged in."""
    X, nans, mlspl_limits = self._prepare_fit(df, options)
    if self.split_by:
        self._fit_groups(X, mlspl_limits)
    else:
        self._dist = get_distribution(self._dist_type, mlspl_limits)
        self._fit_data(self._dist, X[self.feature_variables[0]], mlspl_limits)
    apply_params = ApplyParams(
        self._params.get('show_density', False),
        self._params.get('full_sample', False),
        self._params.get('sample', False),
        self._params.get('show_options', None),
    )
    res = self._call_apply_on_probability_distribution(
        X,
        nans,
        self._threshold,
        apply_params,
        new_name=options.get('output_name', None),
    )
    return df_util.merge_predictions(df, res)
def partial_fit(self, df, options):
    """Fit on the first chunk of data; update the model on later chunks."""
    X, nans, mlspl_limits = self._prepare_fit(df, options)
    feature = self.feature_variables[0]
    if self._dist is None:
        # First chunk of data: behave exactly like fit().
        if self.split_by:
            self._fit_groups(X, mlspl_limits)
        else:
            self._dist = get_distribution(self._dist_type, mlspl_limits)
            self._fit_data(self._dist, X[feature], mlspl_limits)
    elif self.split_by:
        # Subsequent chunk with a by-clause: update per-group models.
        self._update_groups(X, mlspl_limits)
    else:
        # Subsequent chunk, single model: update it in place.
        self._update_data(self._dist, X[feature])
def _set_df_columns_to_none(self, X, params, new_name):
    """Fill every output column of X with None.

    Used when a data group has no stored distribution, so the output
    schema stays consistent with groups that do.
    """
    feature = self.feature_variables[0]
    outlier_col = make_column_name(
        DensityFunction.OUTPUT_NAME,
        new_name=new_name,
        feature_variable=feature,
    )
    X[outlier_col] = None
    X[make_column_name(DensityFunction.BOUNDARY_RANGES)] = None
    X[make_column_name('__mv_{}'.format(DensityFunction.BOUNDARY_RANGES))] = None
    # Optional columns mirror the flags in params.
    if params.show_density:
        X['ProbabilityDensity({})'.format(feature)] = None
    if params.full_sample:
        X['FullSampledValue'] = None
    if params.sample:
        X[DensityFunction.SAMPLED_VALUE] = None
    if params.show_options:
        X[DensityFunction.OPTIONS] = None
def _set_columns(
    self, X, results_dict, main_name, threshold, key, new_name=None, feature_variable=None
):
    """Assign the values in results_dict[key] to output column(s) of X.

    With multiple thresholds there is one column per threshold; otherwise a
    single column. When results_dict[key] is empty, the column(s) are
    created filled with None.
    N.B. See ProbabilityDensity.apply() docstring for the key values of the dictionary.
    """
    if not threshold.is_multiple():
        values = results_dict[key]
        X[make_column_name(main_name, new_name, feature_variable=feature_variable)] = (
            values[0] if values else None
        )
        return
    thresholds = threshold.get_multiple_thresholds()
    # Empty results still produce one (all-None) column per threshold.
    value_lists = results_dict[key] or [None] * len(thresholds)
    for thrsh, values in zip(thresholds, value_lists):
        X[
            make_column_name(main_name, new_name, thrsh, feature_variable=feature_variable)
        ] = values
def _set_df_column_values(self, X, results_dict, new_name, threshold):
    """Populate Dataframe columns from the input dictionary"""
    # Outlier flag column(s): one per threshold (or a single column).
    self._set_columns(
        X,
        results_dict,
        DensityFunction.OUTPUT_NAME,
        threshold,
        ProbabilityDistribution.DICT_KEY_OUTLIERS,
        new_name=new_name,
        feature_variable=self.feature_variables[0],
    )
    (
        str_boundary_ranges,
        multivalue_boundary_ranges,
    ) = self._get_multivalue_of_boundary_ranges(
        results_dict[ProbabilityDistribution.DICT_KEY_BOUNDARY_RANGES], X.shape[0]
    )
    # add formatted boundary ranges (boundary ranges are string and each range is on a new line), each threshold gets a new column
    self._set_columns(
        X,
        {ProbabilityDistribution.DICT_KEY_BOUNDARY_RANGES: str_boundary_ranges},
        DensityFunction.BOUNDARY_RANGES,
        threshold,
        ProbabilityDistribution.DICT_KEY_BOUNDARY_RANGES,
    )
    # add multivalued fields
    self._set_columns(
        X,
        {
            '__mv_{}'.format(
                ProbabilityDistribution.DICT_KEY_BOUNDARY_RANGES
            ): multivalue_boundary_ranges
        },
        '__mv_{}'.format(DensityFunction.BOUNDARY_RANGES),
        threshold,
        '__mv_{}'.format(ProbabilityDistribution.DICT_KEY_BOUNDARY_RANGES),
    )
    # Optional columns below are only present in results_dict when the
    # corresponding ApplyParams flags were set.
    densities = results_dict.get(ProbabilityDistribution.DICT_KEY_DENSITIES)
    if densities is not None:
        X['ProbabilityDensity({})'.format(self.feature_variables[0])] = densities
    samples = results_dict.get(ProbabilityDistribution.DICT_KEY_SAMPLE)
    full_samples = results_dict.get(ProbabilityDistribution.DICT_KEY_FULL_SAMPLE)
    if full_samples is not None:
        X[ProbabilityDistribution.DICT_KEY_FULL_SAMPLE] = full_samples
    if samples is not None:
        self._set_columns(
            X,
            results_dict,
            main_name=DensityFunction.SAMPLED_VALUE,
            threshold=threshold,
            key=ProbabilityDistribution.DICT_KEY_SAMPLE,
        )
    if self._show_options_values is not None:
        # Replicate the JSON options string down the whole column.
        X[DensityFunction.OPTIONS] = np.full(X.shape[0], self._show_options_values).astype(
            object
        )
def _apply_groups(self, X, threshold, params, new_name):
    """Apply the stored distribution of X's group and fill output columns.

    X holds the rows of exactly one group, so the group label is read off
    the first row's by-field values. Groups with no stored model get
    all-None output columns and trigger a single warning.
    """
    label = tuple(X[field].values[0] for field in self.split_by)
    labeled = self._dist.get(DensityFunction._generate_key(label))
    if labeled is None:
        # No stored group found for this data, so the columns stay empty.
        if not self._warned_on_missing_group:
            self._warned_on_missing_group = True
            messages.warn('Some of the data groups do not have a distribution in the model')
        self._set_df_columns_to_none(X, params=params, new_name=new_name)
    else:
        results = labeled.dist.apply(
            X[self.feature_variables[0]], threshold=threshold, params=params
        )
        self._set_df_column_values(
            X, results_dict=results, new_name=new_name, threshold=threshold
        )
    return X
def _get_columns(self, main_name, threshold, new_name=None, feature_variable=None):
    """Return the output column name(s) for main_name.

    A single name when one threshold is in play, otherwise one name per
    threshold value.

    NOTE(review): in the multiple-threshold branch the `feature_variable`
    argument is ignored and self.feature_variables[0] is used instead —
    presumably make_column_name only consumes it for certain main_names;
    verify against make_column_name before changing.
    """
    field_names = [make_column_name(main_name, new_name, feature_variable=feature_variable)]
    if threshold.is_multiple():
        field_names = [
            make_column_name(
                main_name, new_name, thrshld, feature_variable=self.feature_variables[0]
            )
            for thrshld in threshold.get_multiple_thresholds()
        ]
    return field_names
def _get_df_column_names(self, params, new_name, threshold):
    """Get target DataFrame column names.

    NOTE: The order of these column names is important: it must match the
    order in which values are collected in
    _call_apply_on_probability_distribution.

    Args:
        params (ApplyParams): Flags controlling optional output columns.
        new_name (str): Optional output column rename.
        threshold (OutlierThreshold): Determines per-threshold columns.

    Returns:
        list[str]: Output column names.
    """
    # Use the class constant instead of repeating the 'IsOutlier' literal,
    # keeping this consistent with _set_df_column_values.
    field_names = self._get_columns(
        main_name=DensityFunction.OUTPUT_NAME,
        threshold=threshold,
        new_name=new_name,
        feature_variable=self.feature_variables[0],
    )
    field_names.extend(
        self._get_columns(main_name=DensityFunction.BOUNDARY_RANGES, threshold=threshold)
    )
    field_names.extend(
        self._get_columns(
            main_name='__mv_{}'.format(DensityFunction.BOUNDARY_RANGES), threshold=threshold
        )
    )
    if params.show_density:
        field_names.append('ProbabilityDensity({})'.format(self.feature_variables[0]))
    if params.full_sample:
        field_names.append('FullSampledValue')
    if params.sample:
        field_names.extend(
            self._get_columns(main_name=DensityFunction.SAMPLED_VALUE, threshold=threshold)
        )
    if params.show_options:
        field_names.append(DensityFunction.OPTIONS)
    return field_names
def _get_multivalue_of_boundary_ranges(self, boundary_ranges, size):
    """Format boundary ranges as display strings and multivalue encodings.

    For each threshold, its list of [start, end, density] triples becomes
    a single newline-joined string of 'start:end:density' lines (with
    infinities rendered via safe_str_infinity), plus a semicolon-joined
    multivalue encoding of those same lines. Both are replicated `size`
    times as object arrays so they can fill a DataFrame column.

    Example:
        {0.005: [[-inf, 6.6919, 0.0032], [10.7804, inf, 0.0018]]}
        -> string '-Infinity:6.6919:0.0032\\n10.7804:Infinity:0.0018'
        -> multivalue '$-Infinity:6.6919:0.0032$;$10.7804:Infinity:0.0018$'

    Args:
        boundary_ranges (dict): Maps each threshold to a list of
            [start, end, density] triples.
        size (int): Number of rows each value is replicated to.

    Returns:
        tuple(list, list): Per-threshold (in dict order) string arrays and
            multivalue-encoded arrays.
    """
    str_boundary_list = []
    multivalue_boundary_list = []
    for ranges in boundary_ranges.values():
        lines = [
            '{}:{}:{}'.format(safe_str_infinity(rng[0]), safe_str_infinity(rng[1]), rng[2])
            for rng in ranges
        ]
        str_boundary_list.append(np.full(size, '\n'.join(lines)).astype(object))
        multivalue_boundary_list.append(
            np.full(size, ";".join(map(multivalue_encode, lines))).astype(object)
        )
    return str_boundary_list, multivalue_boundary_list
def _call_apply_on_probability_distribution(self, X, nans, threshold, params, new_name):
    """Apply the fitted distribution(s) to X and build the output DataFrame.

    Args:
        X (pd.DataFrame): Prepared feature (and by-field) data.
        nans: Mask of dropped rows from df_util.prepare_features.
        threshold (OutlierThreshold): Threshold(s) to evaluate against.
        params (ApplyParams): Flags for optional output columns.
        new_name (str): Optional output column rename.

    Returns:
        pd.DataFrame: Output columns aligned back to the original rows.
    """
    column_names = self._get_df_column_names(
        params=params, new_name=new_name, threshold=threshold
    )
    if self.split_by:
        # Each group's own distribution fills its columns in-place via
        # _apply_groups; then extract them in the expected column order.
        transformed = df_util.split_by(X, self.split_by).apply(
            partial(
                self._apply_groups, threshold=threshold, params=params, new_name=new_name
            )
        )
        applied = transformed[column_names].values
    else:
        tmp_applied = self._dist.apply(
            X[self.feature_variables[0]], threshold, params=params
        )
        # Collect per-column value arrays in the same order as column_names.
        applied = [elem for elem in tmp_applied['Outliers']]
        (
            str_boundary_ranges,
            multivalue_boundary_ranges,
        ) = self._get_multivalue_of_boundary_ranges(
            tmp_applied['BoundaryRanges'], len(applied[0])
        )
        applied.extend(str_boundary_ranges)
        applied.extend(multivalue_boundary_ranges)
        end_point = len(tmp_applied)
        # SampledValue list exists at the end of the output list and it might consist of one or multiple lists.
        # We want to append those SampledValue lists not the list which is containing them.
        # We go up to the last element in the output list and handle the last element specially.
        if params.sample:
            end_point = end_point - 1
        # Since the first two elements, Outliers and BoundaryRanges, are already added start with index 2.
        for elem in itertoolz.take(end_point - 2, itertoolz.drop(2, tmp_applied.items())):
            applied.append(elem[1])
        if params.sample:
            for elem in tmp_applied[DensityFunction.SAMPLED_VALUE]:
                applied.append(elem)
        if params.show_options:
            applied.append(np.full(X.shape[0], self._show_options_values).astype(object))
        # Stack the per-column arrays into a (rows, cols) matrix.
        applied = np.stack(applied, axis=1)
    return df_util.create_output_dataframe(
        y_hat=applied, nans=nans, output_names=column_names
    )
def _make_threshold_tuple(self):
    """Updates the self._threshold fields to tuples if they are floats.

    The type of threshold is float in old models, whereas it is a tuple in
    new models (multiple-threshold support). Wrapping here keeps old
    models usable in apply().
    """
    # isinstance (rather than an exact `type(x) == float` check) also
    # catches float subclasses, e.g. numpy floating scalars stored by old
    # models; isinstance(None, float) is False so no separate None check
    # is needed.
    if isinstance(self._threshold.threshold, float):
        self._threshold.threshold = (self._threshold.threshold,)
    if isinstance(self._threshold.lower, float):
        self._threshold.lower = (self._threshold.lower,)
    if isinstance(self._threshold.upper, float):
        self._threshold.upper = (self._threshold.upper,)
def apply(self, df, options):
    """Apply a previously fitted model to df and return merged predictions."""
    mlspl_limits = options.get('mlspl_limits', {})
    params = convert_params(
        options.get('params', {}),
        strs=['dist', 'metric', 'show_options', 'exclude_dist'],
        bools=['show_density', 'full_sample', 'sample'],
        multiple_floats=['threshold', 'lower_threshold', 'upper_threshold'],
        ints=['random_state'],
    )
    _params = ApplyParams(
        params.get('show_density', False),
        params.get('full_sample', False),
        params.get('sample', False),
        params.get('show_options', None),
    )
    threshold = OutlierThreshold(
        params.get('threshold'),
        lower=params.get('lower_threshold'),
        upper=params.get('upper_threshold'),
    )
    # Backward compatibility: old models stored thresholds as bare floats.
    self._make_threshold_tuple()
    # Thresholds given at apply time override the fit-time thresholds.
    threshold = threshold if threshold.is_specified() else self._threshold
    self._check_threshold(threshold, int(mlspl_limits.get('max_threshold_num', 5)))
    # dist/metric are fit-time parameters; warn and ignore them here.
    _dist = params.get('dist')
    if _dist and _dist != self._dist_type:
        msg = 'Invalid parameter for apply: Discarding "dist" value. Using dist={}.'
        cexc.messages.warn(msg.format(self._dist_type))
    _metric = params.get('metric')
    if _metric and _metric != self._metric:
        msg = 'Invalid parameter for apply: Discarding "metric" value. Using metric={}.'
        cexc.messages.warn(msg.format(self._metric))
    show_options = params.get('show_options', None)
    self._show_options_values = None
    if show_options:
        self._show_options_values = DensityFunction._get_show_options_value(
            show_options, options
        )
    X = df.copy()
    X, nans, _ = df_util.prepare_features(
        X=X,
        variables=self.feature_variables + (self.split_by or []),
        mlspl_limits=mlspl_limits,
        get_dummies=False,
    )
    df_util.assert_field_present(X, self.feature_variables[0])
    self._check_target_field_is_numeric(X)
    new_name = options.get('output_name', None)
    # Re-surface the fit-time warning about sparse training data, if any.
    self._warn_on_few_training_data()
    res = self._call_apply_on_probability_distribution(
        X, nans, threshold, params=_params, new_name=new_name
    )
    return df_util.merge_predictions(df, res)
def summary(self, options):
    """Return a DataFrame summarizing the fitted distribution(s)."""
    if not self.split_by:
        # Single model: one row of its summary parameters.
        return pd.DataFrame({param: [val] for param, val in self._dist.summary().items()})
    # With a by-clause, each row combines:
    # 1) the field values that define the group, and
    # 2) the parameters of that group's fitted distribution.
    columns = defaultdict(list)
    for labeled in self._dist.values():
        for field, value in chain(labeled.fields.items(), labeled.dist.summary().items()):
            columns[field].append(value)
    return pd.DataFrame({name: pd.Series(values) for name, values in columns.items()})
@staticmethod
def register_codecs():
    """Register SimpleObjectCodec for this algorithm and its supporting classes
    so fitted models can be serialized and reloaded."""
    codec_targets = (
        ('algos.DensityFunction', 'DensityFunction'),
        ('algos_support.density_function.distribution_factory', 'AutoSelectDistribution'),
        ('algos_support.density_function.exponential_distribution', 'ExponentialDistribution'),
        ('algos_support.density_function.kde_distribution', 'KDEDistribution'),
        ('algos.DensityFunction', 'LabeledDistribution'),
        ('algos_support.density_function.normal_distribution', 'NormalDistribution'),
        ('algos_support.density_function.outlier_threshold', 'OutlierThreshold'),
        ('algos_support.density_function.beta_distribution', 'BetaDistribution'),
    )
    for module_path, class_name in codec_targets:
        codecs_manager.add_codec(module_path, class_name, SimpleObjectCodec)