You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Splunk_Deploiement/apps/Splunk_ML_Toolkit/bin/algos/HashingVectorizer.py

106 lines
3.7 KiB

#!/usr/bin/env python
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import HashingVectorizer as _HashingVectorizer
import cexc
from base import BaseAlgo
from util import df_util
from util.param_util import convert_params
# Messages logger from the MLTK `cexc` module; used below via `messages.warn(...)`
# to emit user-facing warnings (presumably surfaced in the Splunk search UI —
# TODO confirm against cexc).
messages = cexc.get_messages_logger()
class HashingVectorizer(BaseAlgo):
    """Hash a single text field into numeric feature columns.

    Wraps scikit-learn's ``HashingVectorizer`` and, by default, follows it
    with a ``TruncatedSVD`` reduction to ``k`` components (``reduce=true``).
    Produces output fields named ``<output_name>_0 .. _N-1``.
    """

    def handle_options(self, options):
        """Validate options: exactly one feature field and no target field.

        Raises:
            RuntimeError: if the field count constraint is violated.
        """
        if (
            len(options.get('feature_variables', [])) != 1
            or len(options.get('target_variable', [])) > 0
        ):
            raise RuntimeError('Syntax error: You must specify exactly one field')

    def __init__(self, options):
        self.handle_options(options)
        out_params = convert_params(
            options.get('params', {}),
            ints=['max_features', 'random_state', 'n_iters', 'k'],
            strs=['stop_words', 'analyzer', 'norm', 'token_pattern'],
            ranges=['ngram_range'],
            bools=['reduce'],
            aliases={'max_features': 'n_features', 'k': 'n_components'},
        )

        # BUGFIX: after aliasing, the user's `k` arrives in out_params as
        # 'n_components' — the original checked for the pre-alias name 'k',
        # which can never be present, so this warning was unreachable.
        if (
            'n_components' in out_params
            and 'reduce' in out_params
            and not out_params['reduce']
        ):
            messages.warn('k parameter is ignored when reduce is set to false.')

        # Split the SVD-specific parameters away from the vectorizer's.
        svd_params = {}
        for opt in ['random_state', 'n_iters', 'n_components']:
            if opt in out_params:
                svd_params[opt] = out_params.pop(opt)

        # reduce defaults to True: reduce hashed features with TruncatedSVD.
        self.do_reduce = out_params.pop('reduce', True)

        out_params.setdefault('n_features', 10000)
        svd_params.setdefault('n_components', 100)

        # Reduction only makes sense if it shrinks the feature space.
        n_components = svd_params['n_components']
        n_features = out_params['n_features']
        if self.do_reduce and n_components >= n_features:
            msg = 'the number of reduced fields (k={}) must be less than the number of features (max_features={})'
            raise RuntimeError(msg.format(n_components, n_features))

        # Construct estimators only after parameters have been validated.
        self.estimator = _HashingVectorizer(**out_params)
        self.reducer = TruncatedSVD(**svd_params)
        self.columns = []

    def fit(self, df, options):
        """Hash (and optionally reduce) the text field, merge into `df`.

        Returns the input dataframe with the new output columns appended.
        """
        # Work on a copy so the caller's dataframe is not mutated.
        X = df.copy()
        # get_dummies must stay off: the raw strings are hashed, not dummied.
        # NOTE(review): self.feature_variables is presumably populated by
        # BaseAlgo from the options — not visible in this file.
        X, nans, self.columns = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            get_dummies=False,
            mlspl_limits=options.get('mlspl_limits'),
        )

        # TruncatedSVD cannot produce more components than there are events;
        # clamp k down (with a warning) when the data is too small.
        length = len(X)
        if length < self.reducer.n_components and self.do_reduce:
            msg = 'Number of valid events ({}) is less than k ({}). Setting k={}.'
            messages.warn(msg.format(length, self.reducer.n_components, length))
            self.reducer.n_components = length

        # Flatten the single text column to a 1-D array of strings.
        X = X.values.ravel().astype('str')
        X = self.estimator.fit_transform(X)
        if self.do_reduce:
            y_hat = self.reducer.fit_transform(X)
        else:
            # Densify the sparse hashed matrix for dataframe output.
            y_hat = X.toarray()

        output_names = self.make_output_names(options)
        output = df_util.create_output_dataframe(
            y_hat=y_hat, output_names=output_names, nans=nans
        )
        df = df_util.merge_predictions(df, output)
        return df

    def make_output_names(self, options):
        """Return the list of output column names, one per output feature."""
        default_name = self.feature_variables[0] + '_hashed'
        if self.do_reduce:
            number_of_fields = self.reducer.n_components
        else:
            number_of_fields = self.estimator.n_features
        output_name = options.get('output_name', default_name)
        output_names = [output_name + '_' + str(i) for i in range(number_of_fields)]
        return output_names