You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

125 lines
4.3 KiB

#!/usr/bin/env python
from sklearn.feature_extraction.text import TfidfVectorizer as _TfidfVectorizer
from base import BaseAlgo
from codec import codecs_manager
from util import df_util
from util.param_util import convert_params
class TFIDF(BaseAlgo):
    """TF-IDF text featurizer backed by scikit-learn's ``TfidfVectorizer``.

    Operates on exactly one input field: ``fit`` learns the vocabulary from
    that field and ``apply`` appends one output column per learned term.
    """

    def handle_options(self, options):
        """Validate that exactly one field and no target field was given.

        Args:
            options: Mapping read for the ``feature_variables`` and
                ``target_variable`` entries (each treated as empty when
                missing).

        Raises:
            RuntimeError: If the number of feature variables is not exactly
                one, or if any target variable is present.
        """
        if (
            len(options.get('feature_variables', [])) != 1
            or len(options.get('target_variable', [])) > 0
        ):
            raise RuntimeError('Syntax error: You must specify exactly one field')

    def __init__(self, options):
        """Validate options, normalize parameters and build the estimator.

        Args:
            options: Mapping with the field selection (see
                ``handle_options``) and an optional ``params`` mapping of
                user-supplied vectorizer parameters.

        Raises:
            RuntimeError: If the field selection is invalid, if ``max_df`` or
                ``min_df`` is not a finite number, or if ``ngram_range`` does
                not have exactly two elements.
        """
        self.handle_options(options)

        out_params = convert_params(
            options.get('params', {}),
            ints=['max_features'],
            strs=['max_df', 'min_df', 'stop_words', 'analyzer', 'norm', 'token_pattern'],
            ranges=['ngram_range'],
        )

        # max_df/min_df arrive as strings. scikit-learn treats a float as a
        # proportion of documents and an int as an absolute document count,
        # so the numeric type chosen here matters.
        for doc_freq, default_val in [('max_df', 1.0), ('min_df', 1)]:
            if doc_freq not in out_params:
                continue
            try:
                float_val = float(out_params[doc_freq])
                int_val = int(float_val)
            except (ValueError, OverflowError):
                # ValueError: not a number (or NaN); OverflowError: +/-inf
                # cannot be converted to int. Both are reported to the user
                # the same way instead of leaking a raw traceback.
                raise RuntimeError(
                    'Syntax Error: {doc_freq} requires a numeric value, e.g. {doc_freq}=1.0'.format(
                        doc_freq=doc_freq
                    )
                ) from None

            if float_val == 1.0:
                # Exactly 1 is ambiguous (100% vs. one document); fall back to
                # the per-parameter default: 1.0 for max_df, 1 for min_df.
                out_params[doc_freq] = default_val
            elif float_val == int_val:
                out_params[doc_freq] = int_val
            else:
                out_params[doc_freq] = float_val

        if 'ngram_range' in out_params and len(out_params['ngram_range']) != 2:
            raise RuntimeError(
                'Syntax Error: ngram_range requires a range, e.g. ngram_range=1-5'
            )

        # TODO: Maybe let the user know that we make this change.
        out_params.setdefault('max_features', 100)

        self.estimator = _TfidfVectorizer(**out_params)

    def fit(self, df, options):
        """Fit the vectorizer on the selected field of ``df``.

        Stores the columns returned by ``df_util.prepare_features`` on
        ``self.columns`` for later use in ``apply``.

        Args:
            df: Input data; a copy is used so the caller's frame is not altered.
            options: Mapping read for the optional ``mlspl_limits`` entry.
        """
        # NOTE(review): self.feature_variables is not assigned anywhere in this
        # class; presumably it is set by BaseAlgo or by the caller -- confirm.
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        # Make sure to turn off get_dummies
        X, _, self.columns = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            get_dummies=False,
            mlspl_limits=options.get('mlspl_limits'),
        )

        # Flatten to a 1-D array of strings, one document per entry.
        X = X.values.ravel().astype('str')
        self.estimator.fit(X)

    def make_output_names(self, options):
        """Build one output column name per term learned by the estimator.

        Names have the form ``<output_name>_<index>_<term>``, where
        ``output_name`` comes from ``options['output_name']`` and defaults to
        ``<field>_tfidf``.

        Args:
            options: Mapping read for the optional ``output_name`` entry.

        Returns:
            List of column names, in the estimator's feature order.
        """
        default_name = self.feature_variables[0] + '_tfidf'
        output_name = options.get('output_name', default_name)
        feature_names = self.estimator.get_feature_names_out()
        return [
            output_name + '_' + str(index) + '_' + word
            for index, word in enumerate(feature_names)
        ]

    def apply(self, df, options):
        """Transform the selected field of ``df`` and merge in the result.

        Args:
            df: Input data; a copy is used for feature preparation.
            options: Mapping read for the optional ``mlspl_limits`` and
                ``output_name`` entries.

        Returns:
            The result of ``df_util.merge_predictions`` applied to ``df`` and
            the TF-IDF output frame.
        """
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        # Make sure to turn off get_dummies
        X, nans, _ = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            final_columns=self.columns,
            get_dummies=False,
            mlspl_limits=options.get('mlspl_limits'),
        )

        X = X.values.ravel().astype('str')
        y_hat = self.estimator.transform(X)

        # Convert the returned sparse matrix into array
        y_hat = y_hat.toarray()

        output_names = self.make_output_names(options)
        output = df_util.create_output_dataframe(
            y_hat=y_hat, output_names=output_names, nans=nans
        )

        df = df_util.merge_predictions(df, output)
        return df

    @staticmethod
    def register_codecs():
        """Register the codecs for this algorithm and the objects it holds."""
        from codec.codecs import SimpleObjectCodec, SparseMatrixCodec

        codecs_manager.add_codec('algos.TFIDF', 'TFIDF', SimpleObjectCodec)
        codecs_manager.add_codec(
            'sklearn.feature_extraction.text', 'TfidfVectorizer', SimpleObjectCodec
        )
        codecs_manager.add_codec(
            'sklearn.feature_extraction.text', 'TfidfTransformer', SimpleObjectCodec
        )
        # NOTE(review): this registers the public-style module path
        # 'scipy.sparse.dia' while csr_matrix below uses the private
        # 'scipy.sparse._csr' path -- confirm which module path the installed
        # SciPy reports for dia_matrix.
        codecs_manager.add_codec('scipy.sparse.dia', 'dia_matrix', SimpleObjectCodec)
        codecs_manager.add_codec('scipy.sparse._csr', 'csr_matrix', SparseMatrixCodec)