Splunk_Deploiement/apps/Splunk_ML_Toolkit/bin/algos/LocalOutlierFactor.py

#!/usr/bin/env python

from sklearn.neighbors import LocalOutlierFactor as _LocalOutlierFactor

from base import ClustererMixin, BaseAlgo
from util import df_util
from util.param_util import convert_params


class LocalOutlierFactor(ClustererMixin, BaseAlgo):
    """Outlier detection backed by scikit-learn's ``LocalOutlierFactor``.

    User-supplied parameters are checked up front so that invalid values
    produce a descriptive ``RuntimeError`` before the estimator is built.
    """

    def __init__(self, options):
        """Validate the user parameters and build the underlying estimator.

        Args:
            options: Options mapping. Its optional ``'params'`` entry holds
                the parameters to convert: ``n_neighbors``, ``leaf_size``,
                ``p``, ``contamination``, ``algorithm``, ``metric`` and
                ``anomaly_score``.

        Raises:
            RuntimeError: If a parameter fails one of the range or
                whitelist checks performed below.
        """
        self.handle_options(options)
        out_params = convert_params(
            options.get('params', {}),
            ints=['n_neighbors', 'leaf_size', 'p'],
            floats=['contamination'],
            strs=['algorithm', 'metric'],
            bools=['anomaly_score'],
        )
        # anomaly_score only controls the output of fit(); remove it here so it
        # is not forwarded to the estimator. Scores are returned by default.
        self.return_scores = out_params.pop('anomaly_score', True)

        # n_neighbors must be > 0
        if 'n_neighbors' in out_params and out_params['n_neighbors'] <= 0:
            msg = 'Invalid value error: n_neighbors must be greater than 0, but found n_neighbors="{}".'
            raise RuntimeError(msg.format(out_params['n_neighbors']))

        # leaf_size must be >= 1
        if 'leaf_size' in out_params and out_params['leaf_size'] < 1:
            msg = 'Invalid value error: leaf_size must be greater than or equal to 1, but found leaf_size="{}".'
            raise RuntimeError(msg.format(out_params['leaf_size']))

        # Whitelist valid values for algorithm, as the error raised by sklearn
        # for invalid values is uninformative.
        valid_algorithms = ['brute', 'kd_tree', 'ball_tree', 'auto']
        if 'algorithm' in out_params and out_params['algorithm'] not in valid_algorithms:
            msg = (
                'Invalid value error: Valid values for algorithm are "brute", "kd_tree", "ball_tree", "auto", '
                'but found algorithm="{}".'
            )
            raise RuntimeError(msg.format(out_params['algorithm']))

        # Whitelist valid values for metric relative to the chosen algorithm,
        # so an unsupported combination is reported with a clear message.
        if 'metric' in out_params:
            # Default value of the algorithm is 'auto'
            self.check_valid_algorithm_metric_combination(
                out_params.get('algorithm', 'auto'), out_params['metric']
            )

        # contamination must be in (0.0, 0.5]; checked here so values out of
        # range are reported with a clear message.
        if 'contamination' in out_params and not (0.0 < out_params['contamination'] <= 0.5):
            msg = (
                'Invalid value error: Valid values for contamination are in (0.0, 0.5], '
                'but found contamination="{}".'
            )
            raise RuntimeError(msg.format(out_params['contamination']))

        # p must be >= 1, but only when the metric is minkowski (which is also
        # the metric assumed when none is given).
        if (
            'p' in out_params
            and (out_params['p'] < 1)
            and out_params.get('metric', 'minkowski') == 'minkowski'
        ):
            msg = 'Invalid value error: p must be greater than or equal to 1 for minkowski metric, but found p="{}".'
            raise RuntimeError(msg.format(out_params['p']))

        self.estimator = _LocalOutlierFactor(**out_params)

    @staticmethod
    def check_valid_algorithm_metric_combination(algorithm, metric):
        """Check if the provided metric is valid for the algorithm, raise an error if not.

        Args:
            algorithm: One of "brute", "kd_tree", "ball_tree" or "auto".
            metric: Name of the distance metric to check.

        Raises:
            RuntimeError: If ``algorithm`` is not one of the supported values,
                or if ``metric`` is not in the whitelist for ``algorithm``.
        """
        kd_tree_metric = [
            'cityblock',
            'euclidean',
            'l1',
            'l2',
            'manhattan',
            'chebyshev',
            'minkowski',
        ]
        ball_tree_metric = kd_tree_metric + [
            'braycurtis',
            'canberra',
            'dice',
            'hamming',
            'jaccard',
            'kulsinski',
            'matching',
            'rogerstanimoto',
            'russellrao',
            'sokalmichener',
            'sokalsneath',
        ]
        all_valid_metric = ball_tree_metric + ['cosine', 'correlation', 'sqeuclidean', 'yule']

        # all metrics are valid for brute and auto since auto already takes care of algorithm-metric match
        valid_algorithm_metric = {
            'brute': all_valid_metric,
            'auto': all_valid_metric,
            'ball_tree': ball_tree_metric,
            'kd_tree': kd_tree_metric,
        }

        # An unknown algorithm gets the same descriptive error as in __init__
        # rather than a bare KeyError from the dictionary lookup.
        valid_metrics = valid_algorithm_metric.get(algorithm)
        if valid_metrics is None:
            msg = (
                'Invalid value error: Valid values for algorithm are "brute", "kd_tree", "ball_tree", "auto", '
                'but found algorithm="{}".'
            )
            raise RuntimeError(msg.format(algorithm))

        if metric not in valid_metrics:
            msg = (
                'Invalid value error: metric "{}" is invalid for algorithm "{}". Please see documentation '
                'for a complete list of valid algorithm-metric combinations.'
            )
            raise RuntimeError(msg.format(metric, algorithm))

    def fit(self, df, options):
        """Fit the estimator on ``df`` and merge the outlier labels into it.

        Args:
            df: Input data frame. A copy is used for feature preparation.
            options: Options mapping; ``'mlspl_limits'`` and ``'output_name'``
                are read from it. The label column defaults to "isOutlier".

        Returns:
            The result of ``df_util.merge_predictions`` applied to ``df`` and
            the predictions. The label column holds 1 for outliers and -1 for
            inliers. When anomaly scores are enabled, an "anomaly_score"
            column holds the negative outlier factor rounded to 2 decimals.
        """
        # Make a copy of data, to not alter original data frame
        X = df.copy()
        X, nans, _ = df_util.prepare_features(
            X=X, variables=self.feature_variables, mlspl_limits=options.get('mlspl_limits')
        )

        # y_hat is 1d array of inliers/outliers in [1, -1], respectively.
        # Inverting y_hat to represent outliers as '1', inliers as '-1' for consistency.
        y_hat = self.estimator.fit_predict(X.values) * -1
        default_name = 'isOutlier'
        output_name = options.get('output_name', default_name)

        # Include the negative outlier factor in the output when requested:
        # each row becomes a (label, score) pair with two output columns.
        if self.return_scores:
            anomaly_scores = self.estimator.negative_outlier_factor_.round(2)
            output_name = [output_name] + ['anomaly_score']
            y_hat = list(zip(y_hat, anomaly_scores))

        output = df_util.create_output_dataframe(
            y_hat=y_hat, nans=nans, output_names=output_name
        )
        df = df_util.merge_predictions(df, output)
        return df