You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Splunk_Deploiement/apps/Splunk_ML_Toolkit/bin/algos/HashingVectorizer.py

106 lines
3.7 KiB

#!/usr/bin/env python
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import HashingVectorizer as _HashingVectorizer
import cexc
from base import BaseAlgo
from util import df_util
from util.param_util import convert_params
# Messages logger from the MLTK `cexc` module; used below via `messages.warn(...)`
# to emit user-facing warnings (presumably surfaced in the Splunk search UI —
# TODO confirm against cexc).
messages = cexc.get_messages_logger()
class HashingVectorizer(BaseAlgo):
    """Hash a single text field into numeric feature columns.

    Wraps scikit-learn's ``HashingVectorizer`` and, by default, follows it
    with a ``TruncatedSVD`` reduction to ``k`` components (``reduce=true``).
    Produces output fields named ``<output_name>_0 .. _N-1``.
    """

    def handle_options(self, options):
        """Validate options: exactly one feature field and no target field.

        Raises:
            RuntimeError: if the field count constraint is violated.
        """
        if (
            len(options.get('feature_variables', [])) != 1
            or len(options.get('target_variable', [])) > 0
        ):
            raise RuntimeError('Syntax error: You must specify exactly one field')

    def __init__(self, options):
        self.handle_options(options)
        out_params = convert_params(
            options.get('params', {}),
            ints=['max_features', 'random_state', 'n_iters', 'k'],
            strs=['stop_words', 'analyzer', 'norm', 'token_pattern'],
            ranges=['ngram_range'],
            bools=['reduce'],
            aliases={'max_features': 'n_features', 'k': 'n_components'},
        )

        # BUGFIX: after aliasing, the user's `k` arrives in out_params as
        # 'n_components' — the original checked for the pre-alias name 'k',
        # which can never be present, so this warning was unreachable.
        if (
            'n_components' in out_params
            and 'reduce' in out_params
            and not out_params['reduce']
        ):
            messages.warn('k parameter is ignored when reduce is set to false.')

        # Split the SVD-specific parameters away from the vectorizer's.
        svd_params = {}
        for opt in ['random_state', 'n_iters', 'n_components']:
            if opt in out_params:
                svd_params[opt] = out_params.pop(opt)

        # reduce defaults to True: reduce hashed features with TruncatedSVD.
        self.do_reduce = out_params.pop('reduce', True)

        out_params.setdefault('n_features', 10000)
        svd_params.setdefault('n_components', 100)

        # Reduction only makes sense if it shrinks the feature space.
        n_components = svd_params['n_components']
        n_features = out_params['n_features']
        if self.do_reduce and n_components >= n_features:
            msg = 'the number of reduced fields (k={}) must be less than the number of features (max_features={})'
            raise RuntimeError(msg.format(n_components, n_features))

        # Construct estimators only after parameters have been validated.
        self.estimator = _HashingVectorizer(**out_params)
        self.reducer = TruncatedSVD(**svd_params)
        self.columns = []

    def fit(self, df, options):
        """Hash (and optionally reduce) the text field, merge into `df`.

        Returns the input dataframe with the new output columns appended.
        """
        # Work on a copy so the caller's dataframe is not mutated.
        X = df.copy()
        # get_dummies must stay off: the raw strings are hashed, not dummied.
        # NOTE(review): self.feature_variables is presumably populated by
        # BaseAlgo from the options — not visible in this file.
        X, nans, self.columns = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            get_dummies=False,
            mlspl_limits=options.get('mlspl_limits'),
        )

        # TruncatedSVD cannot produce more components than there are events;
        # clamp k down (with a warning) when the data is too small.
        length = len(X)
        if length < self.reducer.n_components and self.do_reduce:
            msg = 'Number of valid events ({}) is less than k ({}). Setting k={}.'
            messages.warn(msg.format(length, self.reducer.n_components, length))
            self.reducer.n_components = length

        # Flatten the single text column to a 1-D array of strings.
        X = X.values.ravel().astype('str')
        X = self.estimator.fit_transform(X)
        if self.do_reduce:
            y_hat = self.reducer.fit_transform(X)
        else:
            # Densify the sparse hashed matrix for dataframe output.
            y_hat = X.toarray()

        output_names = self.make_output_names(options)
        output = df_util.create_output_dataframe(
            y_hat=y_hat, output_names=output_names, nans=nans
        )
        df = df_util.merge_predictions(df, output)
        return df

    def make_output_names(self, options):
        """Return the list of output column names, one per output feature."""
        default_name = self.feature_variables[0] + '_hashed'
        if self.do_reduce:
            number_of_fields = self.reducer.n_components
        else:
            number_of_fields = self.estimator.n_features
        output_name = options.get('output_name', default_name)
        output_names = [output_name + '_' + str(i) for i in range(number_of_fields)]
        return output_names