You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
133 lines
5.5 KiB
133 lines
5.5 KiB
#!/usr/bin/env python
|
|
|
|
from sklearn.neighbors import LocalOutlierFactor as _LocalOutlierFactor
|
|
|
|
from base import ClustererMixin, BaseAlgo
|
|
from util import df_util
|
|
from util.param_util import convert_params
|
|
|
|
|
|
class LocalOutlierFactor(ClustererMixin, BaseAlgo):
    """Outlier detection backed by scikit-learn's ``LocalOutlierFactor`` estimator.

    Raw parameters come from ``options['params']``; they are converted with
    ``convert_params`` and validated in ``__init__`` before the estimator is built.
    """

    def __init__(self, options):
        """Convert and validate the parameters, then build the estimator.

        Args:
            options: mapping passed to ``self.handle_options``; its optional
                ``'params'`` entry holds the raw parameter values.

        Raises:
            RuntimeError: if n_neighbors, leaf_size, algorithm, metric,
                contamination or p fails one of the checks below.
        """
        self.handle_options(options)

        params = convert_params(
            options.get('params', {}),
            ints=['n_neighbors', 'leaf_size', 'p'],
            floats=['contamination'],
            strs=['algorithm', 'metric'],
            bools=['anomaly_score'],
        )

        # anomaly_score is removed from the estimator arguments and remembered
        # on the instance so that fit() can consult it later.
        self.return_scores = params.pop('anomaly_score', True)

        # n_neighbors must be strictly positive.
        if 'n_neighbors' in params:
            n_neighbors = params['n_neighbors']
            if n_neighbors <= 0:
                raise RuntimeError(
                    'Invalid value error: n_neighbors must be greater than 0, but found n_neighbors="{}".'.format(
                        n_neighbors
                    )
                )

        # leaf_size must be at least 1.
        if 'leaf_size' in params:
            leaf_size = params['leaf_size']
            if leaf_size < 1:
                raise RuntimeError(
                    'Invalid value error: leaf_size must be greater than or equal to 1, but found leaf_size="{}".'.format(
                        leaf_size
                    )
                )

        # algorithm is checked here because the error sklearn raises for an
        # invalid value is uninformative.
        if 'algorithm' in params:
            algorithm = params['algorithm']
            if algorithm not in ('brute', 'kd_tree', 'ball_tree', 'auto'):
                raise RuntimeError(
                    (
                        'Invalid value error: Valid values for algorithm are "brute", "kd_tree", "ball_tree", "auto", '
                        'but found algorithm="{}".'
                    ).format(algorithm)
                )

        # metric is validated against the chosen algorithm; when no algorithm
        # was supplied the default 'auto' is used for the lookup.
        if 'metric' in params:
            self.check_valid_algorithm_metric_combination(
                params.get('algorithm', 'auto'), params['metric']
            )

        # contamination must lie in (0.0, 0.5]. The negated chained comparison
        # is kept on purpose so that a NaN value is rejected as well.
        if 'contamination' in params:
            contamination = params['contamination']
            if not (0.0 < contamination <= 0.5):
                raise RuntimeError(
                    (
                        'Invalid value error: Valid values for contamination are in (0.0, 0.5], '
                        'but found contamination="{}".'
                    ).format(contamination)
                )

        # p must be >= 1, but only when the metric is minkowski (also the
        # value assumed when no metric was given).
        if 'p' in params and params['p'] < 1 and params.get('metric', 'minkowski') == 'minkowski':
            raise RuntimeError(
                'Invalid value error: p must be greater than or equal to 1 for minkowski metric, but found p="{}".'.format(
                    params['p']
                )
            )

        self.estimator = _LocalOutlierFactor(**params)

    @staticmethod
    def check_valid_algorithm_metric_combination(algorithm, metric):
        """Raise RuntimeError when ``metric`` is not accepted for ``algorithm``.

        Args:
            algorithm: one of 'brute', 'auto', 'ball_tree' or 'kd_tree'.
            metric: name of the distance metric to check.

        Raises:
            RuntimeError: if the metric is not listed for the algorithm.
        """
        kd_tree_metrics = (
            'cityblock',
            'euclidean',
            'l1',
            'l2',
            'manhattan',
            'chebyshev',
            'minkowski',
        )
        ball_tree_metrics = kd_tree_metrics + (
            'braycurtis',
            'canberra',
            'dice',
            'hamming',
            'jaccard',
            'kulsinski',
            'matching',
            'rogerstanimoto',
            'russellrao',
            'sokalmichener',
            'sokalsneath',
        )
        every_metric = ball_tree_metrics + ('cosine', 'correlation', 'sqeuclidean', 'yule')

        # brute and auto accept every metric, since auto already takes care of
        # matching the algorithm to the metric.
        metrics_by_algorithm = {
            'brute': every_metric,
            'auto': every_metric,
            'ball_tree': ball_tree_metrics,
            'kd_tree': kd_tree_metrics,
        }

        if metric not in metrics_by_algorithm[algorithm]:
            raise RuntimeError(
                (
                    'Invalid value error: metric "{}" is invalid for algorithm "{}". Please see documentation '
                    'for a complete list of valid algorithm-metric combinations.'
                ).format(metric, algorithm)
            )

    def fit(self, df, options):
        """Fit the estimator on ``df`` and return ``df`` merged with the predictions.

        Args:
            df: input data frame; a copy is prepared so the original is not altered.
            options: mapping read for 'mlspl_limits' and 'output_name'
                (the output name defaults to 'isOutlier').

        Returns:
            The result of ``df_util.merge_predictions`` applied to ``df`` and
            the prediction output.
        """
        # Feature preparation works on a copy so the caller's frame stays untouched.
        features, nans, _ = df_util.prepare_features(
            X=df.copy(), variables=self.feature_variables, mlspl_limits=options.get('mlspl_limits')
        )

        # fit_predict marks inliers as 1 and outliers as -1; the sign is
        # flipped so outliers are reported as 1 and inliers as -1.
        predictions = self.estimator.fit_predict(features.values) * -1
        column_names = options.get('output_name', 'isOutlier')

        # Optionally add the negative outlier factor, rounded to two decimals,
        # as a second output column.
        if self.return_scores:
            scores = self.estimator.negative_outlier_factor_.round(2)
            column_names = [column_names, 'anomaly_score']
            predictions = list(zip(predictions, scores))

        output = df_util.create_output_dataframe(
            y_hat=predictions, nans=nans, output_names=column_names
        )
        return df_util.merge_predictions(df, output)
|