You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
285 lines
12 KiB
285 lines
12 KiB
#!/usr/bin/env python
|
|
|
|
import json
|
|
from collections import OrderedDict
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
from algos_support.density_function.distance_metric import DistanceMetric
|
|
from algos_support.density_function.outlier_threshold import OutlierThreshold
|
|
from algos_support.density_function.probability_distribution import (
|
|
ApplyParams,
|
|
DistributionType,
|
|
DistributionName,
|
|
ProbabilityDistribution,
|
|
)
|
|
from algos.DensityFunction import DensityFunction
|
|
from algos.PCA import PCA
|
|
from algos.StandardScaler import StandardScaler
|
|
|
|
from base import BaseAlgo
|
|
from codec import codecs_manager
|
|
from codec.codecs import SimpleObjectCodec
|
|
from util import df_util
|
|
from util.constants import NUM_PCA_COMPONENTS
|
|
from util.param_util import convert_params
|
|
import cexc
|
|
|
|
# Module-level logger, obtained from cexc under this module's name; used in
# this file for debug output (see _set_random_state).
logger = cexc.get_logger(__name__)
|
|
# Messages logger from cexc; used in this file by _handle_options to emit a
# warning. NOTE(review): presumably this surfaces the text to the search user
# rather than only to the log file -- confirm against cexc.
messages = cexc.get_messages_logger()
|
|
|
|
|
|
class MultivariateOutlierDetection(BaseAlgo):
    """Outlier detection over several features via a scale -> PCA -> density pipeline.

    Fitting standardises the feature columns with StandardScaler, projects the
    scaled ('SS_'-prefixed) columns with PCA using NUM_PCA_COMPONENTS
    components, and hands the resulting 'PC_1' column to DensityFunction.
    ``apply`` replays the same three fitted stages on new data.
    """
|
|
def __init__(self, options):
    """Validate the search options and initialise the detector's configuration.

    Args:
        options: Options dictionary for the search. This method reads the
            'params', 'mlspl_limits' and 'split_by' keys directly; the
            helpers it calls (`_handle_options` and DensityFunction's
            show_options helper) read further keys.

    Raises:
        RuntimeError: If `dist`, `exclude_dist` or `metric` holds an
            unsupported value; if `exclude_dist` is used without dist=auto
            or excludes every distribution type; if "max_threshold_num" is
            not a non-negative integer; or if more thresholds are supplied
            than "max_threshold_num" allows.
    """
    MultivariateOutlierDetection._handle_options(options)

    self._params = convert_params(
        options.get('params', {}),
        strs=['dist', 'metric', 'show_options', "exclude_dist"],
        bools=['show_density', 'full_sample', 'sample'],
        multiple_floats=['threshold', 'lower_threshold', 'upper_threshold'],
        ints=['random_state'],
    )
    acceptable_dists = (
        DistributionType.AUTO,
        DistributionType.NORMAL,
        DistributionType.EXPONENTIAL,
        DistributionType.GAUSSIAN_KDE,
        DistributionType.BETA,
    )
    # Same set as above minus AUTO: only concrete distributions can be excluded.
    acceptable_exclude_dist = (
        DistributionType.NORMAL,
        DistributionType.EXPONENTIAL,
        DistributionType.GAUSSIAN_KDE,
        DistributionType.BETA,
    )
    self._dist_type = self._params.pop('dist', DistributionType.AUTO)
    if self._dist_type not in acceptable_dists:
        msg = 'Invalid value error: dist must be one of {}, but found dist="{}".'
        dists = ', '.join(['\"{}\"'.format(x) for x in acceptable_dists])
        raise RuntimeError(msg.format(dists, self._dist_type))

    self._exclude_dist = None
    exclude_string = self._params.pop('exclude_dist', None)
    if exclude_string:
        if self._dist_type != DistributionType.AUTO:
            raise RuntimeError(
                'The exclude_dist parameter can only be used when dist=auto. Update your SPL search parameters and run the search again.'
            )
        # exclude_dist is a comma-separated list; surrounding blanks are ignored.
        self._exclude_dist = [name.strip() for name in exclude_string.split(",")]
        excludes = ', '.join(['\"{}\"'.format(x) for x in acceptable_exclude_dist])
        for name in self._exclude_dist:
            if name not in acceptable_exclude_dist:
                msg = 'Invalid value error: exclude_dist must be one or more of {}, but found "{}" in exclude_dist.'
                raise RuntimeError(msg.format(excludes, name))
        # Every entry is known to be acceptable at this point, so "all
        # distributions excluded" is a pure membership question. Comparing
        # sorted lists would let a duplicated entry slip past the check.
        if all(dist in self._exclude_dist for dist in acceptable_exclude_dist):
            raise RuntimeError(
                "You cannot exclude all of the distribution types when using the exclude_dist parameter. Update your SPL search parameters to include at least one distribution type and run the search again."
            )

    self._metric = self._params.get('metric', DistanceMetric.WASSERSTEIN)
    acceptable_metrics = [DistanceMetric.KOLMOGOROV_SMIRNOV, DistanceMetric.WASSERSTEIN]
    if self._metric not in acceptable_metrics:
        msg = 'Invalid value error: metric must be one of {}, but found metric="{}".'
        metrics = ', '.join(['\"{}\"'.format(x) for x in acceptable_metrics])
        raise RuntimeError(msg.format(metrics, self._metric))

    self._distance = None
    # the value of self._dist is either a single instance of ProbabilityDistribution
    # (if no by-clause is used) or a map of groups to instances of
    # ProbabilityDistribution
    self._dist = None
    mlspl_limits = options.get('mlspl_limits', {})

    # threshold is a tuple of floats even if there is only one value
    self._threshold = OutlierThreshold(
        threshold=self._params.get('threshold'),
        lower=self._params.get('lower_threshold'),
        upper=self._params.get('upper_threshold'),
        default_threshold=(float(mlspl_limits.get('default_prob_threshold', 0.01)),),
    )
    max_threshold_num = mlspl_limits.get('max_threshold_num', 5)
    try:
        max_threshold_num = int(max_threshold_num)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are translated; interrupts and exits
        # must not be swallowed by this handler.
        raise RuntimeError(
            '"max_threshold_num" must be an integer. Found "max_threshold_num"={}.'.format(
                max_threshold_num
            )
        )
    if max_threshold_num < 0:
        msg = '"max_threshold_num" can not be a negative number. Found "max_threshold_num"={}.'
        raise RuntimeError(msg.format(max_threshold_num))
    self._check_threshold(self._threshold, max_threshold_num)

    self.split_by = options.get('split_by')

    show_options = self._params.get('show_options', None)
    self._show_options_values = None
    if show_options:
        # Delegates to DensityFunction's helper, not this class's own copy.
        self._show_options_values = DensityFunction._get_show_options_value(
            show_options, options
        )

    # Flag that is set to true when, during `fit`, there are too few
    # training points for one or more of the groups
    self._warned_on_few_training_data = False
    # Flag that is set to true when, during `apply`, we encounter
    # a group that the model does not have a distribution for.
    self._warned_on_missing_group = False
    # Flag that is set to true when the distribution type is
    # Exponential and one of the given thresholds is lower_threshold
    self._warned_on_expon_lower_threshold = False
    # Flag that is set to true when the distribution type is
    # Beta and one of the given thresholds is upper_threshold
    self._warned_on_beta_upper_threshold = False
|
|
|
|
def _check_threshold(self, threshold, max_num_threshold):
|
|
"""Verify the specified threshold is acceptable"""
|
|
assert self._threshold.is_specified()
|
|
if threshold.is_multiple():
|
|
size_th = threshold.get_size()
|
|
if size_th > max_num_threshold:
|
|
raise RuntimeError(
|
|
'The maximum number of allowed thresholds are {}. Found {} thresholds.'.format(
|
|
max_num_threshold, size_th
|
|
)
|
|
)
|
|
|
|
def _set_random_state(self):
|
|
random_state = self._params.get('random_state')
|
|
if random_state is not None:
|
|
logger.debug('Setting random state to %s' % random_state)
|
|
np.random.seed(random_state)
|
|
|
|
def _check_target_field_is_numeric(self, X):
|
|
if not np.issubdtype(X[self.feature_variables[0]].dtype, np.number):
|
|
raise RuntimeError(
|
|
'Feature \"{}\" is not a numeric type'.format(self.feature_variables[0])
|
|
)
|
|
|
|
@staticmethod
|
|
def _get_show_options_value(show_options, options):
|
|
dict_show_options = OrderedDict()
|
|
show_options = show_options.replace(" ", "")
|
|
absent = []
|
|
for k in show_options.split(","):
|
|
if k not in options.keys():
|
|
absent.append(k)
|
|
else:
|
|
dict_show_options[k] = options[k]
|
|
return json.dumps(dict_show_options)
|
|
|
|
@staticmethod
|
|
def _handle_options(options):
|
|
if len(options.get('feature_variables', [])) == 1:
|
|
messages.warning(
|
|
'Outlier detection on univariate data can be better provided using the DensityFunction algorithm rather than MultivariateOutlierDetection.'
|
|
)
|
|
mlspl_limits = options.get('mlspl_limits', {})
|
|
max_fields_in_by_clause = int(mlspl_limits.get('max_fields_in_by_clause', 5))
|
|
if len(options.get('split_by', [])) > max_fields_in_by_clause:
|
|
raise RuntimeError(
|
|
'The number of fields in the by clause cannot exceed {}'.format(
|
|
max_fields_in_by_clause
|
|
)
|
|
)
|
|
if 'model_name' in options:
|
|
summary_fnames = [
|
|
'type',
|
|
'min',
|
|
'max',
|
|
'mean',
|
|
'std',
|
|
'cardinality',
|
|
'distance',
|
|
'other',
|
|
]
|
|
by_fields = options.get('split_by', [])
|
|
for fname in by_fields:
|
|
if fname in summary_fnames:
|
|
raise RuntimeError(
|
|
f'The field "{fname}" conflicts with summary field names "{", ".join(summary_fnames)}". '
|
|
f'Please rename "{fname}".'
|
|
)
|
|
|
|
def _prepare_fit(self, df, options):
    """Run the scaling and PCA stages and build the DensityFunction to fit.

    Fits a StandardScaler and a PCA on ``df``, stores both on the instance
    (``ss_estimator`` and ``pca_estimator``), and creates an unfitted
    DensityFunction targeting the 'PC_1' column of the PCA output.

    Args:
        df: Input data passed to the StandardScaler.
        options: Options dictionary; it is copied for each stage and never
            modified.

    Returns:
        Tuple ``(pca_output, den_func, mlspl_limits, pca_output_options)``:
        the PCA-transformed data, the unfitted DensityFunction, the
        'mlspl_limits' entry of ``options`` (``{}`` if absent), and the
        options copy prepared for the DensityFunction.
    """
    self._set_random_state()
    limits = options.get('mlspl_limits', {})

    # Stage 1: standardise the features. Any "as" clause (output_name) and
    # the user params are withheld; the DensityFunction consumes those.
    scaler_options = options.copy()
    scaler_options.pop('output_name', None)
    scaler_options['params'] = {}
    scaler = StandardScaler(scaler_options)
    scaler.feature_variables = self.feature_variables.copy()
    scaler.fit(df, scaler_options)
    self.ss_estimator = scaler
    scaled = scaler.apply(df, scaler_options)

    # Stage 2: PCA over the scaled columns, requesting NUM_PCA_COMPONENTS
    # components. The scaled columns are addressed by an 'SS_' prefix.
    reducer_options = options.copy()
    reducer_options.pop('output_name', None)
    reducer_options['params'] = {'k': NUM_PCA_COMPONENTS}
    reducer = PCA(reducer_options)
    reducer.feature_variables = ['SS_' + name for name in self.feature_variables]
    reducer.fit(scaled, reducer_options)
    self.pca_estimator = reducer
    projected = reducer.apply(scaled, reducer_options)

    # Stage 3: a DensityFunction whose only feature is the first principal
    # component produced by the PCA stage.
    density_options = options.copy()
    density_options['feature_variables'] = ['PC_1']
    density_options['args'] = ['PC_1']
    density = DensityFunction(density_options)
    density.feature_variables = density_options['feature_variables']
    df_util.assert_field_present(projected, density.feature_variables[0])
    self._check_target_field_is_numeric(projected)

    return projected, density, limits, density_options
|
|
|
|
def fit(self, df, options):
    """Fit the scale -> PCA -> density pipeline on ``df``.

    Args:
        df: Training data.
        options: Options dictionary for the search.

    Returns:
        Whatever the DensityFunction's ``fit`` returns for the PCA output.
    """
    projected, density, _limits, density_options = self._prepare_fit(df, options)
    result = density.fit(projected, density_options)
    # The density model is stored only after it has fitted successfully.
    self.den_func = density
    return result
|
|
|
|
def partial_fit(self, df, options):
    """Rebuild the scaling and PCA stages on ``df`` and partially fit the density.

    Args:
        df: Training data for this update.
        options: Options dictionary for the search.

    Returns:
        Whatever the DensityFunction's ``partial_fit`` returns for the PCA
        output.
    """
    projected, density, _limits, density_options = self._prepare_fit(df, options)
    result = density.partial_fit(projected, density_options)
    # The density model is stored only after the update has succeeded.
    self.den_func = density
    return result
|
|
|
|
def apply(self, df, options):
    """Score ``df`` by replaying the fitted scaler, PCA and density stages.

    Args:
        df: Data to score.
        options: Options dictionary; passed unchanged to the scaler and the
            PCA, and copied (with 'feature_variables' and 'args' pointed at
            'PC_1') for the DensityFunction.

    Returns:
        The result of the DensityFunction's ``apply`` on the PCA output.
    """
    scaled = self.ss_estimator.apply(df, options)
    projected = self.pca_estimator.apply(scaled, options)
    density_options = options.copy()
    for key in ('feature_variables', 'args'):
        # A fresh list per key, so the two entries never share one object.
        density_options[key] = ['PC_1']
    return self.den_func.apply(projected, density_options)
|
|
|
|
def summary(self, options):
    """Return the density summary with the PCA summary appended column-wise.

    The PCA summary is repeated once per row of the density summary so
    that both frames can be concatenated side by side.

    Args:
        options: Options dictionary forwarded to both ``summary`` calls.

    Returns:
        A DataFrame holding the DensityFunction summary columns followed by
        the PCA summary columns. If the density summary has no rows, the
        result has no rows but still carries both sets of columns.
    """
    den_func_df = self.den_func.summary(options)
    pca_df = self.pca_estimator.summary(options)
    repeats = len(den_func_df)
    if repeats:
        pca_df = pd.concat([pca_df] * repeats, ignore_index=True)
    else:
        # pd.concat raises "No objects to concatenate" on an empty list;
        # keep only the PCA columns so the column-wise concat still works.
        pca_df = pca_df.iloc[:0]
    return pd.concat([den_func_df, pca_df], axis=1)
|
|
|
|
@staticmethod
def register_codecs():
    """Register the codec for this class and for the algorithms it wraps.

    Adds a SimpleObjectCodec entry for MultivariateOutlierDetection, then
    calls ``register_codecs`` on DensityFunction, PCA and StandardScaler,
    in that order.
    """
    codecs_manager.add_codec(
        'algos.MultivariateOutlierDetection',
        'MultivariateOutlierDetection',
        SimpleObjectCodec,
    )
    for wrapped_algo in (DensityFunction, PCA, StandardScaler):
        wrapped_algo.register_codecs()
|