You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Splunk_Deploiement/apps/Splunk_ML_Toolkit/bin/algos/LocalOutlierFactor.py

133 lines
5.5 KiB

#!/usr/bin/env python
from sklearn.neighbors import LocalOutlierFactor as _LocalOutlierFactor
from base import ClustererMixin, BaseAlgo
from util import df_util
from util.param_util import convert_params
class LocalOutlierFactor(ClustererMixin, BaseAlgo):
    """Outlier detection backed by scikit-learn's ``LocalOutlierFactor``.

    The constructor converts the raw parameters and range-checks them so that
    invalid values fail with a descriptive ``RuntimeError`` before the
    scikit-learn estimator is instantiated.
    """

    def __init__(self, options):
        """Validate the supplied parameters and build the estimator.

        Args:
            options: Mapping of options; its optional ``'params'`` entry holds
                the raw algorithm parameters (an empty dict is used if absent).

        Raises:
            RuntimeError: Raised by the checks in this method when
                ``n_neighbors``, ``leaf_size``, ``algorithm``, ``metric``,
                ``contamination`` or ``p`` has an unsupported value.
        """
        # NOTE(review): handle_options is inherited; presumably it populates
        # self.feature_variables, which fit() reads -- confirm in base.
        self.handle_options(options)

        params = convert_params(
            options.get('params', {}),
            ints=['n_neighbors', 'leaf_size', 'p'],
            floats=['contamination'],
            strs=['algorithm', 'metric'],
            bools=['anomaly_score'],
        )

        # 'anomaly_score' is not forwarded to the estimator: remove it from
        # the keyword set and remember it for fit(). Scores are on by default.
        self.return_scores = params.pop('anomaly_score', True)

        # n_neighbors must be strictly positive.
        if 'n_neighbors' in params:
            n_neighbors = params['n_neighbors']
            if n_neighbors <= 0:
                template = 'Invalid value error: n_neighbors must be greater than 0, but found n_neighbors="{}".'
                raise RuntimeError(template.format(n_neighbors))

        # leaf_size must be at least 1.
        if 'leaf_size' in params:
            leaf_size = params['leaf_size']
            if leaf_size < 1:
                template = 'Invalid value error: leaf_size must be greater than or equal to 1, but found leaf_size="{}".'
                raise RuntimeError(template.format(leaf_size))

        # Reject unknown algorithms here, because the error sklearn raises
        # for them is uninformative.
        if 'algorithm' in params:
            algorithm = params['algorithm']
            if algorithm not in ('brute', 'kd_tree', 'ball_tree', 'auto'):
                template = (
                    'Invalid value error: Valid values for algorithm are "brute", "kd_tree", "ball_tree", "auto", '
                    'but found algorithm="{}".'
                )
                raise RuntimeError(template.format(algorithm))

        # A metric is only acceptable relative to the algorithm in use; when
        # no algorithm was supplied, 'auto' is assumed for the check.
        if 'metric' in params:
            effective_algorithm = params.get('algorithm', 'auto')
            self.check_valid_algorithm_metric_combination(effective_algorithm, params['metric'])

        # contamination must lie in (0.0, 0.5]. The negated chained comparison
        # is kept on purpose: it also rejects NaN.
        if 'contamination' in params:
            contamination = params['contamination']
            if not (0.0 < contamination <= 0.5):
                template = (
                    'Invalid value error: Valid values for contamination are in (0.0, 0.5], '
                    'but found contamination="{}".'
                )
                raise RuntimeError(template.format(contamination))

        # p is only range-checked when the metric is minkowski, which is also
        # the metric assumed when none was supplied.
        if 'p' in params and params['p'] < 1:
            if params.get('metric', 'minkowski') == 'minkowski':
                template = 'Invalid value error: p must be greater than or equal to 1 for minkowski metric, but found p="{}".'
                raise RuntimeError(template.format(params['p']))

        self.estimator = _LocalOutlierFactor(**params)

    @staticmethod
    def check_valid_algorithm_metric_combination(algorithm, metric):
        """Raise if ``metric`` is not accepted for ``algorithm``.

        Args:
            algorithm: One of ``'brute'``, ``'auto'``, ``'ball_tree'`` or
                ``'kd_tree'``.
            metric: Name of the distance metric to check.

        Raises:
            RuntimeError: If ``metric`` is not listed for ``algorithm``.
            KeyError: If ``algorithm`` is not one of the four names above.
        """
        kd_tree_metrics = [
            'cityblock',
            'euclidean',
            'l1',
            'l2',
            'manhattan',
            'chebyshev',
            'minkowski',
        ]
        ball_tree_extras = [
            'braycurtis',
            'canberra',
            'dice',
            'hamming',
            'jaccard',
            'kulsinski',
            'matching',
            'rogerstanimoto',
            'russellrao',
            'sokalmichener',
            'sokalsneath',
        ]
        brute_extras = ['cosine', 'correlation', 'sqeuclidean', 'yule']

        # Each tier is a superset of the previous one.
        ball_tree_metrics = kd_tree_metrics + ball_tree_extras
        every_metric = ball_tree_metrics + brute_extras

        # 'auto' gets the same full list as 'brute'.
        allowed_by_algorithm = {
            'kd_tree': kd_tree_metrics,
            'ball_tree': ball_tree_metrics,
            'brute': every_metric,
            'auto': every_metric,
        }

        if metric in allowed_by_algorithm[algorithm]:
            return

        template = (
            'Invalid value error: metric "{}" is invalid for algorithm "{}". Please see documentation '
            'for a complete list of valid algorithm-metric combinations.'
        )
        raise RuntimeError(template.format(metric, algorithm))

    def fit(self, df, options):
        """Fit the estimator on ``df`` and merge the outlier labels into it.

        Args:
            df: Input data frame; features are prepared from a copy of it.
            options: Mapping of options; ``'mlspl_limits'`` and
                ``'output_name'`` are read from it when present.

        Returns:
            The result of ``df_util.merge_predictions`` applied to ``df`` and
            the prediction frame. The label column is named by
            ``options['output_name']`` (default ``'isOutlier'``); an
            ``'anomaly_score'`` column is added when scores were requested.
        """
        # Prepare features from a copy of the input frame.
        features, nans, _ = df_util.prepare_features(
            X=df.copy(),
            variables=self.feature_variables,
            mlspl_limits=options.get('mlspl_limits'),
        )

        # fit_predict labels inliers as 1 and outliers as -1; flip the sign so
        # that outliers are reported as 1 and inliers as -1.
        raw_labels = self.estimator.fit_predict(features.values)
        predictions = raw_labels * -1

        column_names = options.get('output_name', 'isOutlier')

        if self.return_scores:
            # Pair every label with its negative outlier factor (2 decimals).
            scores = self.estimator.negative_outlier_factor_.round(2)
            column_names = [column_names, 'anomaly_score']
            predictions = list(zip(predictions, scores))

        output = df_util.create_output_dataframe(
            y_hat=predictions, nans=nans, output_names=column_names
        )
        return df_util.merge_predictions(df, output)