SH-Deployer/apps/Splunk_ML_Toolkit/bin/algos/TFIDF.py

#!/usr/bin/env python

from sklearn.feature_extraction.text import TfidfVectorizer as _TfidfVectorizer

from base import BaseAlgo
from codec import codecs_manager
from util import df_util
from util.param_util import convert_params


class TFIDF(BaseAlgo):
    """TF-IDF text vectorizer built on scikit-learn's ``TfidfVectorizer``.

    The estimator is fitted on a single field and, on apply, one output
    column is produced per term in the fitted vocabulary.

    NOTE(review): ``self.feature_variables`` is read by ``fit``, ``apply`` and
    ``make_output_names`` but is never assigned in this class; presumably
    ``BaseAlgo`` or the calling framework sets it -- confirm.
    """

    def handle_options(self, options):
        """Validate that exactly one feature field and no target is given.

        Args:
            options: dict of algorithm options; only the ``feature_variables``
                and ``target_variable`` entries are inspected.

        Raises:
            RuntimeError: if the number of feature fields is not exactly one,
                or if a non-empty ``target_variable`` is supplied.
        """
        if (
            len(options.get('feature_variables', [])) != 1
            or len(options.get('target_variable', [])) > 0
        ):
            raise RuntimeError('Syntax error: You must specify exactly one field')

    def __init__(self, options):
        """Validate the options and build the underlying ``TfidfVectorizer``.

        Args:
            options: dict of algorithm options. ``options['params']`` (if
                present) is passed through ``convert_params`` and the result
                is forwarded to ``TfidfVectorizer`` as keyword arguments.

        Raises:
            RuntimeError: if the field specification is invalid, if
                ``max_df``/``min_df`` is not a finite number, or if
                ``ngram_range`` does not have exactly two elements.
        """
        self.handle_options(options)

        out_params = convert_params(
            options.get('params', {}),
            ints=['max_features'],
            strs=['max_df', 'min_df', 'stop_words', 'analyzer', 'norm', 'token_pattern'],
            ranges=['ngram_range'],
        )

        # max_df/min_df arrive as strings; turn them into the int/float that
        # TfidfVectorizer expects.
        for doc_freq, default_val in (('max_df', 1.0), ('min_df', 1)):
            if doc_freq in out_params:
                out_params[doc_freq] = self._coerce_doc_freq(
                    doc_freq, out_params[doc_freq], default_val
                )

        if 'ngram_range' in out_params and len(out_params['ngram_range']) != 2:
            raise RuntimeError(
                'Syntax Error: ngram_range requires a range, e.g. ngram_range=1-5'
            )

        # TODO: Maybe let the user know that we make this change.
        out_params.setdefault('max_features', 100)

        self.estimator = _TfidfVectorizer(**out_params)

    @staticmethod
    def _coerce_doc_freq(name, raw_value, default_val):
        """Convert a raw ``max_df``/``min_df`` value to an int or a float.

        Args:
            name: parameter name, used only in the error message.
            raw_value: the user-supplied value (a string after
                ``convert_params``).
            default_val: value to return when the input equals 1.

        Returns:
            ``default_val`` if the value equals 1, an ``int`` if the value is
            whole, otherwise a ``float``.

        Raises:
            RuntimeError: if the value cannot be parsed as a finite number.
        """
        # EAFP... convert max_df/min_df to float/int if it is a number.
        try:
            float_val = float(raw_value)
            # int() raises ValueError for NaN and OverflowError for +/-inf;
            # both must surface as the same user-facing syntax error.
            int_val = int(float_val)
        except (ValueError, OverflowError) as err:
            raise RuntimeError(
                'Syntax Error: {doc_freq} requires a numeric value, e.g. {doc_freq}=1.0'.format(
                    doc_freq=name
                )
            ) from err

        # scikit-learn treats an int as an absolute document count and a float
        # as a proportion, so "1" vs "1.0" is ambiguous: use the parameter's
        # own default instead (1.0 for max_df, 1 for min_df).
        if float_val == 1.0:
            return default_val
        if float_val == int_val:
            return int_val
        return float_val

    def fit(self, df, options):
        """Fit the vectorizer on the configured field of ``df``.

        Args:
            df: input dataframe; it is copied before being prepared.
            options: dict of options; ``mlspl_limits`` is forwarded to
                ``df_util.prepare_features``.

        Side effects:
            Stores the third value returned by ``prepare_features`` in
            ``self.columns`` and fits ``self.estimator`` in place.
        """
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        # Make sure to turn off get_dummies
        X, _, self.columns = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            get_dummies=False,
            mlspl_limits=options.get('mlspl_limits'),
        )

        # Flatten to a 1-D array of strings, one entry per row.
        X = X.values.ravel().astype('str')
        self.estimator.fit(X)

    def make_output_names(self, options):
        """Build one output column name per term in the fitted vocabulary.

        Names have the form ``<output_name>_<index>_<term>`` where
        ``output_name`` comes from ``options['output_name']`` or defaults to
        ``<first feature variable>_tfidf``.

        Args:
            options: dict of options; ``output_name`` is optional.

        Returns:
            list of str, in the order given by
            ``self.estimator.get_feature_names_out()``.
        """
        default_name = self.feature_variables[0] + '_tfidf'
        output_name = options.get('output_name', default_name)
        feature_names = self.estimator.get_feature_names_out()
        output_names = [
            output_name + '_' + str(index) + '_' + word
            for (index, word) in enumerate(feature_names)
        ]
        return output_names

    def apply(self, df, options):
        """Transform the configured field of ``df`` and merge the result in.

        Args:
            df: input dataframe; it is copied before being prepared.
            options: dict of options; ``mlspl_limits`` is forwarded to
                ``df_util.prepare_features`` and ``output_name`` is used for
                naming the output columns.

        Returns:
            The result of ``df_util.merge_predictions(df, output)``.
        """
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        # Make sure to turn off get_dummies
        X, nans, _ = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            final_columns=self.columns,
            get_dummies=False,
            mlspl_limits=options.get('mlspl_limits'),
        )

        X = X.values.ravel().astype('str')
        y_hat = self.estimator.transform(X)

        # Convert the returned sparse matrix into array
        y_hat = y_hat.toarray()

        output_names = self.make_output_names(options)

        # NOTE(review): `nans` presumably marks rows dropped by
        # prepare_features so the output can be re-aligned -- confirm.
        output = df_util.create_output_dataframe(
            y_hat=y_hat, output_names=output_names, nans=nans
        )

        df = df_util.merge_predictions(df, output)
        return df

    @staticmethod
    def register_codecs():
        """Register the codecs for this algorithm and the objects it holds.

        Registers codecs with ``codecs_manager`` for this class, the
        scikit-learn TF-IDF classes and two scipy sparse matrix types.
        """
        from codec.codecs import SimpleObjectCodec, SparseMatrixCodec

        codecs_manager.add_codec('algos.TFIDF', 'TFIDF', SimpleObjectCodec)
        codecs_manager.add_codec(
            'sklearn.feature_extraction.text', 'TfidfVectorizer', SimpleObjectCodec
        )
        codecs_manager.add_codec(
            'sklearn.feature_extraction.text', 'TfidfTransformer', SimpleObjectCodec
        )
        # NOTE(review): the two scipy module paths use different naming
        # ('scipy.sparse.dia' vs 'scipy.sparse._csr'); verify both match the
        # module names of the installed scipy version.
        codecs_manager.add_codec('scipy.sparse.dia', 'dia_matrix', SimpleObjectCodec)
        codecs_manager.add_codec('scipy.sparse._csr', 'csr_matrix', SparseMatrixCodec)