parent
f3a8e40cfc
commit
56995569d1
@ -0,0 +1,201 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
@ -0,0 +1 @@
|
||||
This is where you put any scripts you want to add to this app.
|
||||
@ -0,0 +1,92 @@
|
||||
import numpy as np
|
||||
from sklearn.cluster import AgglomerativeClustering as AgClustering
|
||||
from sklearn.metrics import silhouette_samples
|
||||
|
||||
from base import BaseAlgo
|
||||
from util.param_util import convert_params
|
||||
from util import df_util
|
||||
|
||||
|
||||
class AgglomerativeClustering(BaseAlgo):
    """Use scikit-learn's AgglomerativeClustering algorithm to cluster data."""

    def __init__(self, options):
        """Validate options, convert parameters, and build the estimator.

        Raises:
            RuntimeError: if no fields are supplied, a from clause is used,
                or linkage/affinity values are invalid or incompatible.
        """
        feature_variables = options.get('feature_variables', {})
        target_variable = options.get('target_variable', {})

        # Ensure fields are present
        if len(feature_variables) == 0:
            raise RuntimeError('You must supply one or more fields')

        # No from clause allowed
        if len(target_variable) > 0:
            raise RuntimeError('AgglomerativeClustering does not support the from clause')

        # Bug fix: fit() reads self.feature_variables, but it was never
        # assigned here — save the requested fields on the instance.
        self.feature_variables = feature_variables

        # Convert params & alias k to n_clusters
        params = options.get('params', {})
        out_params = convert_params(
            params,
            ints=['k'],
            strs=['linkage', 'affinity'],
            aliases={'k': 'n_clusters'}
        )

        # Check for valid linkage
        if 'linkage' in out_params:
            valid_linkage = ['ward', 'complete', 'average']
            if out_params['linkage'] not in valid_linkage:
                raise RuntimeError('linkage must be one of: {}'.format(', '.join(valid_linkage)))

        # Check for valid affinity
        if 'affinity' in out_params:
            valid_affinity = ['l1', 'l2', 'cosine', 'manhattan',
                              'precomputed', 'euclidean']
            if out_params['affinity'] not in valid_affinity:
                raise RuntimeError('affinity must be one of: {}'.format(', '.join(valid_affinity)))

        # Check for invalid affinity & linkage combination:
        # ward linkage only supports euclidean affinity.
        if 'linkage' in out_params and 'affinity' in out_params:
            if out_params['linkage'] == 'ward':
                if out_params['affinity'] != 'euclidean':
                    raise RuntimeError('ward linkage (default) must use euclidean affinity (default)')

        # Initialize the estimator
        self.estimator = AgClustering(**out_params)

    def fit(self, df, options):
        """Do the clustering & merge labels with original data."""
        # Make a copy of the input data
        X = df.copy()

        # Use the df_util prepare_features method to
        # - drop null columns & rows
        # - convert categorical columns into dummy indicator columns
        # X is our cleaned data, nans is a mask of the null value locations
        X, nans, columns = df_util.prepare_features(X, self.feature_variables)

        # Do the actual clustering
        y_hat = self.estimator.fit_predict(X.values)

        # attach silhouette coefficient score for each row
        silhouettes = silhouette_samples(X, y_hat)

        # Combine the two arrays, and transpose them.
        y_hat = np.vstack([y_hat, silhouettes]).T

        # Assign default output names
        default_name = 'cluster'

        # Get the value from the as-clause if present
        output_name = options.get('output_name', default_name)

        # Two output columns: one for the labels, one for the silhouette scores
        output_names = [output_name, 'silhouette_score']

        # Use the predictions & nans-mask to create a new dataframe
        output_df = df_util.create_output_dataframe(y_hat, nans, output_names)

        # Merge the dataframe with the original input data
        df = df_util.merge_predictions(df, output_df)
        return df
|
||||
@ -0,0 +1,109 @@
|
||||
|
||||
from base import BaseAlgo
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
from sklearn.metrics.pairwise import pairwise_distances
|
||||
from cexc import get_logger
|
||||
from util import df_util
|
||||
from util.param_util import convert_params
|
||||
|
||||
# Everyone's favorite in memory collaborative filter, not a scaleable solution for millions of users and millions of items
|
||||
# https://en.wikipedia.org/wiki/Collaborative_filtering
|
||||
# please check out more scaleable solutions in KNN or "Recommender Systems: The Textbook"
|
||||
# TODO add coldstart solution for nulls
|
||||
# TODO currently we assume a |fillnull value=0 is run in splunk prior to calling the algorithm
|
||||
|
||||
# We ASSUME rows are users, columns are items.
|
||||
# TODO I seem to cause splunk memory issues with wide tables, so I should consider doing an XYSERIES like reshape
|
||||
# TODO and consider taking in a table of USERID, ITEM , RATING from splunk. Yucky.
|
||||
|
||||
# TODO There are many many many other distance metrics that could be a good fit.
|
||||
|
||||
|
||||
class CollaborativeFilter(BaseAlgo):
    """In-memory collaborative filter over a user x item rating matrix.

    Rows are assumed to be users and columns items; predictions are
    cosine-similarity based, computed either per item or per user.
    Not a scalable solution for millions of users/items — see KNN or
    "Recommender Systems: The Textbook" for scalable approaches.
    """

    def __init__(self, options):
        """Read params and set defaults for user_field and rating_type."""
        # set parameters
        params = options.get('params', {})
        out_params = convert_params(
            params,
            strs=['user_field', 'rating_type', 'coldstart_field']
        )

        # set defaults for parameters
        self.user_field = out_params.get('user_field', "SME")

        # rating_type defaults to "item"; only "item" or "user" are honored,
        # anything else silently falls back to "item".
        self.rating_type = "item"
        if out_params.get('rating_type') == "user":
            self.rating_type = "user"

    def fit(self, df, options):
        """Compute similarity predictions and return them as a DataFrame."""
        # df contains all the search results, including hidden fields,
        # but the requested fields are saved as self.feature_variables
        logger = get_logger('MyCustomLogging')

        X = df.copy()

        # Splunk exposes a number of hidden fields as part of the search
        # protocol; prepare_features keeps only valid field names.
        # Make sure to turn off get_dummies.
        X, _, self.columns = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            get_dummies=False,
            mlspl_limits=options.get('mlspl_limits'),
        )

        # test if the user field is in the data
        logger.debug("The user field is %s", self.user_field)
        try:
            my_list_index = X[self.user_field].values
        except KeyError:
            # Bug fix: the old code used a bare `except:` (hiding unrelated
            # errors) and passed the field name as a spare RuntimeError arg,
            # so the message was never actually formatted.
            raise RuntimeError('You must specify user field that exists. '
                               'You sent {}'.format(self.user_field))

        X = X.drop([self.user_field], axis=1)
        my_list_header = X.columns.values

        # ratings as a matrix; clean the data up
        # (assumes |fillnull value=0 was run in Splunk before the algorithm)
        X = X.replace([np.inf, -np.inf], "nan").replace("nan", "0")
        matrix = X.values
        # force type for Numpy math
        matrix = matrix.astype(np.float64)

        # TODO: consider erroring out on super-sparse user data
        # TODO: add other distance metrics via parameter
        user_sim = pairwise_distances(matrix, metric='cosine')
        item_sim = pairwise_distances(matrix.T, metric='cosine')

        # item prediction
        item_sim = matrix.dot(item_sim) / np.array([np.abs(item_sim).sum(axis=1)])

        # user prediction: mean-centered ratings weighted by user similarity
        mean_user_rating = matrix.mean(axis=1)
        matrix_diff = (matrix - mean_user_rating[:, np.newaxis])
        user_sim = mean_user_rating[:, np.newaxis] + user_sim.dot(matrix_diff) / np.array([np.abs(user_sim).sum(axis=1)]).T

        # add the user ids and item headers back onto the prediction matrix
        if self.rating_type == "item":
            output_df = pd.DataFrame(item_sim, columns=my_list_header, index=my_list_index)
        if self.rating_type == "user":
            output_df = pd.DataFrame(user_sim, columns=my_list_header, index=my_list_index)
        output_df[self.user_field] = pd.Series(my_list_index).values

        return output_df
|
||||
|
||||
|
||||
|
||||
|
||||
@ -0,0 +1,57 @@
|
||||
from base import BaseAlgo
|
||||
|
||||
|
||||
class CorrelationMatrix(BaseAlgo):
    """Compute and return a correlation matrix."""

    def __init__(self, options):
        """Check for valid correlation type, and save it to an attribute on self."""
        feature_variables = options.get('feature_variables', {})
        target_variable = options.get('target_variable', {})

        if len(feature_variables) == 0:
            raise RuntimeError('You must supply one or more fields')

        if len(target_variable) > 0:
            raise RuntimeError('CorrelationMatrix does not support the from clause')

        valid_methods = ['spearman', 'kendall', 'pearson']

        # Parameters supplied in the search, if any
        params = options.get('params', {})

        if 'method' in params:
            # Reject anything pandas' corr() would not understand
            chosen = params['method']
            if chosen not in valid_methods:
                raise RuntimeError(
                    'Invalid value for method: must be one of {}'.format(
                        ', '.join(valid_methods)))
            self.method = chosen
        else:
            # Default correlation method; with no method given, no other
            # parameter is acceptable either.
            self.method = 'pearson'
            if len(params) > 0:
                raise RuntimeError('The only valid parameter is method.')

    def fit(self, df, options):
        """Compute the correlations and return a DataFrame."""
        # df carries every search result, including hidden fields; only the
        # explicitly requested fields (self.feature_variables) participate.
        selected = df[self.feature_variables]

        # corr() yields a square matrix indexed by field name; reset the
        # index so all the data ends up in regular columns.
        correlations = selected.corr(method=self.method)
        return correlations.reset_index()
|
||||
@ -0,0 +1,66 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
from sklearn.tree import DecisionTreeClassifier as _DecisionTreeClassifier
|
||||
from base import ClassifierMixin, BaseAlgo
|
||||
from codec import codecs_manager
|
||||
from util.param_util import convert_params
|
||||
from util.algo_util import tree_summary
|
||||
|
||||
#This algorithm is an updated version of DecisionTreecClassifier from MLTK and class weight parameter has been added to it
|
||||
|
||||
class CustomDecisionTreeClassifier(ClassifierMixin, BaseAlgo):
    """DecisionTreeClassifier from MLTK, extended with a class_weight parameter."""

    def __init__(self, options):
        self.handle_options(options)

        out_params = convert_params(
            options.get('params', {}),
            ints=['random_state', 'max_depth', 'min_samples_split', 'max_leaf_nodes'],
            strs=['criterion', 'splitter', 'max_features', 'class_weight'],
        )

        # Whitelist valid values for criterion, as the error raised by sklearn
        # for invalid values is uninformative. (Bug fix: the old assert-based
        # check was silently stripped under `python -O`.)
        if 'criterion' in out_params and out_params['criterion'] not in ('gini', 'entropy'):
            raise RuntimeError('Invalid value for option criterion: "%s"' % out_params['criterion'])

        # Whitelist valid values for splitter, for the same reason.
        if 'splitter' in out_params and out_params['splitter'] not in ('best', 'random'):
            raise RuntimeError('Invalid value for option splitter: "%s"' % out_params['splitter'])

        # Without an explicit depth limit, cap the tree size instead
        if 'max_depth' not in out_params:
            out_params.setdefault('max_leaf_nodes', 2000)

        # EAFP... convert max_features to int or float if it is a number.
        # (Bug fix: the bare `except:` also swallowed KeyboardInterrupt and
        # SystemExit; only the expected lookup/conversion failures pass now.)
        try:
            out_params['max_features'] = float(out_params['max_features'])
            max_features_int = int(out_params['max_features'])
            if out_params['max_features'] == max_features_int:
                out_params['max_features'] = max_features_int
        except (KeyError, ValueError, TypeError):
            pass

        if 'class_weight' in out_params:
            # class_weight arrives as a string such as "{0: 1, 1: 5}";
            # literal_eval parses it safely (no arbitrary code execution).
            try:
                from ast import literal_eval
                out_params['class_weight'] = literal_eval(out_params['class_weight'])
            except Exception:
                raise RuntimeError('Invalid value for option class_weight: "%s"' % out_params['class_weight'])

        self.estimator = _DecisionTreeClassifier(**out_params)

    def summary(self, options):
        """Return a tree summary; no extra arguments are accepted."""
        if 'args' in options:
            raise RuntimeError('Summarization does not take values other than parameters')
        return tree_summary(self, options)

    @staticmethod
    def register_codecs():
        from codec.codecs import SimpleObjectCodec, TreeCodec
        codecs_manager.add_codec('algos_contrib.CustomDecisionTreeClassifier', 'CustomDecisionTreeClassifier', SimpleObjectCodec)
        codecs_manager.add_codec('sklearn.tree.tree', 'DecisionTreeClassifier', SimpleObjectCodec)
        codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec)
|
||||
@ -0,0 +1,9 @@
|
||||
from base import BaseAlgo
|
||||
|
||||
|
||||
class ExampleAlgo(BaseAlgo):
    """Minimal example algorithm: accepts any options and echoes the data back."""

    def __init__(self, options):
        # Nothing to configure for this example.
        pass

    def fit(self, df, options):
        # Identity transform: return the search results unchanged.
        return df
|
||||
@ -0,0 +1,51 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
from pandas import DataFrame
|
||||
from sklearn.ensemble import ExtraTreesClassifier as _ExtraTreesClassifier
|
||||
|
||||
from base import ClassifierMixin, BaseAlgo
|
||||
from codec import codecs_manager
|
||||
from util.param_util import convert_params
|
||||
from util.algo_util import handle_max_features
|
||||
|
||||
|
||||
class ExtraTreesClassifier(ClassifierMixin, BaseAlgo):
    """Wrapper around scikit-learn's ExtraTreesClassifier."""

    def __init__(self, options):
        self.handle_options(options)

        cfg = convert_params(
            options.get('params', {}),
            ints=['random_state', 'n_estimators', 'max_depth',
                  'min_samples_split', 'max_leaf_nodes'],
            strs=['max_features', 'criterion'],
        )

        # With no explicit depth limit, bound the tree size instead.
        if 'max_depth' not in cfg:
            cfg.setdefault('max_leaf_nodes', 2000)

        # max_features may be an int, a float, or one of sklearn's keywords.
        if 'max_features' in cfg:
            cfg['max_features'] = handle_max_features(cfg['max_features'])

        self.estimator = _ExtraTreesClassifier(class_weight='balanced',
                                               **cfg)

    def summary(self, options):
        """Return per-feature importances; takes no extra options."""
        if len(options) != 2:  # only model name and mlspl_limits
            raise RuntimeError('"%s" models do not take options for summarization' % self.__class__.__name__)
        importances = self.estimator.feature_importances_.ravel()
        return DataFrame({
            'feature': self.columns,
            'importance': importances
        })

    @staticmethod
    def register_codecs():
        from codec.codecs import SimpleObjectCodec, TreeCodec
        codecs_manager.add_codec('algos_contrib.ExtraTreesClassifier',
                                 'ExtraTreesClassifier', SimpleObjectCodec)
        codecs_manager.add_codec('sklearn.ensemble.forest',
                                 'ExtraTreesClassifier', SimpleObjectCodec)
        codecs_manager.add_codec('sklearn.tree.tree', 'ExtraTreeClassifier',
                                 SimpleObjectCodec)
        codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec)
|
||||
@ -0,0 +1,128 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
from sklearn.ensemble import IsolationForest as _IsolationForest
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from base import ClustererMixin, BaseAlgo
|
||||
from codec import codecs_manager
|
||||
from codec.codecs import BaseCodec
|
||||
from codec.flatten import flatten, expand
|
||||
from util import df_util
|
||||
from util.param_util import convert_params
|
||||
from cexc import get_messages_logger,get_logger
|
||||
|
||||
class IsolationForest(ClustererMixin, BaseAlgo):
    """
    This is the implementation wrapper around Isolation Forest from scikit-learn.
    It inherits methods from ClustererMixin and BaseAlgo.
    """

    def __init__(self, options):
        self.handle_options(options)
        out_params = convert_params(
            options.get('params', {}),
            ints=['n_estimators', 'n_jobs', 'random_state', 'verbose'],
            floats=['max_samples', 'contamination', 'max_features'],
            bools=['bootstrap']
        )
        # NOTE(review): 'anomaly_score' is never declared in convert_params
        # above, so this pop always yields True and self.return_scores is
        # never read elsewhere in this class — confirm before removing.
        self.return_scores = out_params.pop('anomaly_score', True)

        # whitelist n_estimators > 0
        if 'n_estimators' in out_params and out_params['n_estimators'] <= 0:
            msg = 'Invalid value error: n_estimators must be greater than 0 and an integer, but found n_estimators="{}".'
            raise RuntimeError(msg.format(out_params['n_estimators']))

        # whitelist 0 < max_samples <= 1
        # (Bug fix: the old check `x<0 and x>1` was a contradiction and could
        # never fire, so invalid values slipped through to sklearn.)
        if 'max_samples' in out_params and not (0 < out_params['max_samples'] <= 1):
            msg = 'Invalid value error: max_samples must be greater than 0 and a float, but found max_samples="{}".'
            raise RuntimeError(msg.format(out_params['max_samples']))

        # whitelist contamination in (0.0, 0.5], as the error raised by
        # sklearn for out-of-range values is uninformative
        if 'contamination' in out_params and not (0.0 < out_params['contamination'] <= 0.5):
            msg = (
                'Invalid value error: Valid values for contamination are in (0.0, 0.5], '
                'but found contamination="{}".'
            )
            raise RuntimeError(msg.format(out_params['contamination']))

        # whitelist 0 < max_features <= 1 (same `and`-vs-range bug as above)
        if 'max_features' in out_params and not (0 < out_params['max_features'] <= 1):
            msg = 'Invalid value error: max_features must be greater than 0, but found max_features="{}".'
            raise RuntimeError(msg.format(out_params['max_features']))

        self.estimator = _IsolationForest(**out_params)

    def apply(self, df, options):
        """Label each row as outlier/inlier and merge with the input data."""
        logger = get_logger('IsolationForest Logger')
        # Make a copy of data, to not alter the original dataframe
        X = df.copy()

        X, nans, _ = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            final_columns=self.columns,
            mlspl_limits=options.get('mlspl_limits'),
        )

        # Multiply by -1 so outliers are represented with 1 and
        # inliers/normal points with -1 (sklearn uses the opposite signs).
        y_hat = self.estimator.predict(X.values) * -1

        # Log the share of -1 labels.
        # NOTE(review): after the flip, -1 marks inliers, so this "Accuracy"
        # is really the inlier percentage — confirm intent before relying on it.
        accuracy = "Accuracy: {}".format(str(round((list(y_hat).count(-1) * 100) / y_hat.shape[0], 2)))
        logger.debug(accuracy)

        y_hat = y_hat.astype('str')

        # Assign output_name
        default_name = 'isOutlier'
        new_name = options.get('output_name', None)
        output_name = self.rename_output(default_names=default_name, new_names=new_name)

        # Create output dataframe
        output = df_util.create_output_dataframe(
            y_hat=y_hat, nans=nans, output_names=output_name
        )
        # Merge with original dataframe
        output = df_util.merge_predictions(df, output)
        return output

    def rename_output(self, default_names, new_names=None):
        """Utility hook to rename output.

        The default behavior is to take the default_names passed in and simply
        return them. If however a particular algo needs to rename the columns of
        the output, this method can be overridden.
        """
        return new_names if new_names is not None else default_names

    @staticmethod
    def register_codecs():
        from codec.codecs import SimpleObjectCodec, TreeCodec
        codecs_manager.add_codec('algos.IsolationForest', 'IsolationForest', SimpleObjectCodec)
        codecs_manager.add_codec('sklearn.ensemble.iforest', 'IsolationForest', SimpleObjectCodec)
        codecs_manager.add_codec('sklearn.tree.tree', 'ExtraTreeRegressor', ExtraTreeRegressorCodec)
        codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec)
|
||||
|
||||
|
||||
class ExtraTreeRegressorCodec(BaseCodec):
    """
    Codec that (de)serializes the ExtraTreeRegressor base estimator used by
    Isolation Forest, so models can be saved to memory/file.
    """

    @classmethod
    def encode(cls, obj):
        import sklearn.tree
        # Strict type check: a subclass would need its own codec.
        assert type(obj) == sklearn.tree.tree.ExtraTreeRegressor
        return {
            '__mlspl_type': [type(obj).__module__, type(obj).__name__],
            'state': obj.__getstate__(),
        }

    @classmethod
    def decode(cls, obj):
        from sklearn.tree.tree import ExtraTreeRegressor
        # Recreate the estimator without running __init__, then restore state.
        regressor = ExtraTreeRegressor.__new__(ExtraTreeRegressor)
        regressor.__setstate__(obj['state'])
        return regressor
|
||||
@ -0,0 +1,35 @@
|
||||
'''
Once a newer version of sklearn is in use, the ``k`` alias will need to be
changed from ``n_topics`` to ``n_components``.
https://stackoverflow.com/a/48121678
'''
|
||||
|
||||
from sklearn.decomposition import LatentDirichletAllocation as _LatentDirichletAllocation
|
||||
from base import BaseAlgo, TransformerMixin
|
||||
from codec import codecs_manager
|
||||
from util.param_util import convert_params
|
||||
|
||||
class LatentDirichletAllocation(TransformerMixin, BaseAlgo):
    """Topic-modeling transformer wrapping sklearn's LatentDirichletAllocation."""

    def __init__(self, options):
        self.handle_options(options)
        out_params = convert_params(
            options.get('params', {}),
            floats=['doc_topic_prior', 'learning_decay', 'learning_offset',
                    'perp_tol', 'mean_change_tol'],
            strs=['learning_method'],
            ints=['k', 'max_iter', 'batch_size', 'evaluate_every',
                  'total_samples', 'max_doc_update_iter', 'n_jobs',
                  'verbose', 'random_state'],
            aliases={'k': 'n_topics'},
        )
        self.estimator = _LatentDirichletAllocation(**out_params)

    def rename_output(self, default_names, new_names):
        """Name output columns LDA_1..LDA_n (or <new_names>_i when given)."""
        prefix = 'LDA' if new_names is None else new_names
        return ['{}_{}'.format(prefix, idx + 1)
                for idx in xrange(len(default_names))]

    @staticmethod
    def register_codecs():
        from codec.codecs import SimpleObjectCodec
        codecs_manager.add_codec('algos_contrib.LatentDirichletAllocation',
                                 'LatentDirichletAllocation', SimpleObjectCodec)
        codecs_manager.add_codec('sklearn.decomposition.online_lda',
                                 'LatentDirichletAllocation', SimpleObjectCodec)
|
||||
@ -0,0 +1,29 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
from sklearn.svm import LinearSVC as _LinearSVC
|
||||
|
||||
from codec import codecs_manager
|
||||
from base import BaseAlgo, ClassifierMixin
|
||||
from util.param_util import convert_params
|
||||
|
||||
|
||||
class LinearSVC(ClassifierMixin, BaseAlgo):
    """Linear support-vector classifier wrapping sklearn's LinearSVC."""

    def __init__(self, options):
        self.handle_options(options)

        out_params = convert_params(
            options.get('params', {}),
            # Fix: 'gamma' removed from floats -- it is a kernel parameter
            # of SVC, not accepted by LinearSVC; converting it only led to
            # a TypeError from the estimator constructor.
            floats=['C', 'tol', 'intercept_scaling'],
            ints=['random_state', 'max_iter'],
            strs=['penalty', 'loss', 'multi_class'],
            bools=['dual', 'fit_intercept'],
        )

        self.estimator = _LinearSVC(**out_params)

    @staticmethod
    def register_codecs():
        from codec.codecs import SimpleObjectCodec
        codecs_manager.add_codec('algos_contrib.LinearSVC', 'LinearSVC', SimpleObjectCodec)
        codecs_manager.add_codec('sklearn.svm.classes', 'LinearSVC', SimpleObjectCodec)
|
||||
@ -0,0 +1,83 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
from sklearn.manifold import MDS as _MDS
|
||||
|
||||
from base import BaseAlgo, TransformerMixin
|
||||
from codec import codecs_manager
|
||||
from util.param_util import convert_params
|
||||
|
||||
from util import df_util
|
||||
|
||||
class MDS(TransformerMixin, BaseAlgo):
    """Multidimensional scaling (MDS) transformer.

    sklearn's MDS exposes no separate transform(); apply() therefore runs
    fit_transform on whatever data it is given.
    """

    def __init__(self, options):
        self.handle_options(options)
        out_params = convert_params(
            options.get('params', {}),
            ints=['k', 'max_iter', 'n_init', 'n_jobs'],
            floats=['eps'],
            bools=['metric'],
            aliases={'k': 'n_components'}
        )

        # setdefault alone is sufficient; the previous
        # "if key not in out_params" guards around each call were redundant.
        out_params.setdefault('max_iter', 300)
        out_params.setdefault('n_init', 4)
        out_params.setdefault('n_jobs', 1)
        out_params.setdefault('eps', 0.001)
        out_params.setdefault('metric', True)

        self.estimator = _MDS(**out_params)

    def rename_output(self, default_names, new_names):
        """Name output columns MDS_1..MDS_n (or <new_names>_i when given)."""
        if new_names is None:
            new_names = 'MDS'
        return ['{}_{}'.format(new_names, i + 1)
                for i in xrange(len(default_names))]

    def apply(self, df, options):
        """Embed the feature fields and merge the coordinates into df."""
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        # Prepare the features
        X, nans, _ = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            final_columns=self.columns,
        )

        # No transform() on MDS -- compute the embedding directly.
        y_hat = self.estimator.fit_transform(X.values)

        # Assign output_name
        output_name = options.get('output_name', None)
        default_names = self.make_output_names(
            output_name=output_name,
            n_names=y_hat.shape[1],
        )
        output_names = self.rename_output(default_names, output_name)

        # Create output dataframe
        output = df_util.create_output_dataframe(
            y_hat=y_hat,
            nans=nans,
            output_names=output_names,
        )

        # Merge with original dataframe
        output = df_util.merge_predictions(df, output)
        return output

    @staticmethod
    def register_codecs():
        from codec.codecs import SimpleObjectCodec
        codecs_manager.add_codec('algos_contrib.MDS', 'MDS', SimpleObjectCodec)
        codecs_manager.add_codec('sklearn.manifold.MDS', 'MDS', SimpleObjectCodec)
|
||||
@ -0,0 +1,60 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import pandas as pd
|
||||
from sklearn.preprocessing import MinMaxScaler as _MinMaxScaler
|
||||
|
||||
from base import BaseAlgo, TransformerMixin
|
||||
from codec import codecs_manager
|
||||
from util.param_util import convert_params
|
||||
from util import df_util
|
||||
|
||||
|
||||
class MinMaxScaler(TransformerMixin, BaseAlgo):
    """Scale numeric fields to a fixed range (sklearn default [0, 1])."""

    def __init__(self, options):
        self.handle_options(options)

        out_params = convert_params(
            options.get('params', {}),
            bools=['copy'],
            # NOTE(review): sklearn expects feature_range as a (min, max)
            # tuple; a raw string would be rejected by the estimator --
            # confirm whether parsing (e.g. "0-1") should be added here.
            strs=['feature_range']
        )
        self.estimator = _MinMaxScaler(**out_params)
        # Feature names seen during (partial_)fit; None until first fit.
        self.columns = None

    def rename_output(self, default_names, new_names=None):
        """Prefix each scaled column with MMS_ (or a user-supplied name)."""
        if new_names is None:
            new_names = 'MMS'
        return [new_names + '_' + feature for feature in self.columns]

    def partial_fit(self, df, options):
        """Incrementally fit the scaler on one chunk of data."""
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        X, _, columns = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            mlspl_limits=options.get('mlspl_limits'),
        )
        if self.columns is not None:
            # Later chunks must line up with the columns seen so far.
            df_util.handle_new_categorical_values(X, None, options, self.columns)
            if X.empty:
                return
        else:
            self.columns = columns
        self.estimator.partial_fit(X)

    def summary(self, options):
        """Return per-field scaling statistics of the fitted estimator."""
        if len(options) != 2:  # only model name and mlspl_limits
            raise RuntimeError('"%s" models do not take options for summarization' % self.__class__.__name__)
        # Bug fix: MinMaxScaler has no mean_/var_ attributes (those belong
        # to StandardScaler); summarizing them raised AttributeError.
        # Report the attributes MinMaxScaler actually exposes.
        return pd.DataFrame({'fields': self.columns,
                             'min': self.estimator.min_,
                             'scale': self.estimator.scale_,
                             'data_min': self.estimator.data_min_,
                             'data_max': self.estimator.data_max_})

    @staticmethod
    def register_codecs():
        from codec.codecs import SimpleObjectCodec
        codecs_manager.add_codec('algos_contrib.MinMaxScaler', 'MinMaxScaler', SimpleObjectCodec)
        codecs_manager.add_codec('sklearn.preprocessing.data', 'MinMaxScaler', SimpleObjectCodec)
|
||||
@ -0,0 +1,31 @@
|
||||
from sklearn.decomposition import NMF as _NMF
|
||||
from base import BaseAlgo, TransformerMixin
|
||||
from codec import codecs_manager
|
||||
from util.param_util import convert_params
|
||||
|
||||
class NMF(TransformerMixin, BaseAlgo):
    """Non-negative matrix factorization transformer."""

    def __init__(self, options):
        self.handle_options(options)
        out_params = convert_params(
            options.get('params', {}),
            floats=['beta_loss', 'tol', 'alpha', 'l1_ratio'],
            strs=['init', 'solver'],
            ints=['k', 'max_iter', 'random_state'],
            # Bug fix: 'versbose' was a typo, so the option could never
            # reach the estimator's 'verbose' parameter.
            bools=['verbose', 'shuffle'],
            aliases={'k': 'n_components'}
        )

        self.estimator = _NMF(**out_params)

    def rename_output(self, default_names, new_names):
        """Name output columns NMF_1..NMF_n (or <new_names>_i when given)."""
        if new_names is None:
            new_names = 'NMF'
        return ['{}_{}'.format(new_names, i + 1)
                for i in xrange(len(default_names))]

    @staticmethod
    def register_codecs():
        from codec.codecs import SimpleObjectCodec
        codecs_manager.add_codec('algos_contrib.NMF', 'NMF', SimpleObjectCodec)
        codecs_manager.add_codec('sklearn.decomposition.nmf', 'NMF', SimpleObjectCodec)
|
||||
@ -0,0 +1,38 @@
|
||||
import pandas as pd
|
||||
from sklearn.linear_model import OrthogonalMatchingPursuit as _OrthogonalMatchingPursuit
|
||||
from base import RegressorMixin, BaseAlgo
|
||||
from util.param_util import convert_params
|
||||
from util import df_util
|
||||
|
||||
|
||||
class OrthogonalMatchingPursuit(RegressorMixin, BaseAlgo):
    """Sparse linear regression via Orthogonal Matching Pursuit."""

    def __init__(self, options):
        self.handle_options(options)

        params = options.get('params', {})
        out_params = convert_params(
            params,
            # Fix: 'kernel' removed -- OrthogonalMatchingPursuit takes no
            # kernel parameter (it appears to have been copied from SVR);
            # converting it only produced a TypeError at construction.
            floats=['tol'],
            ints=['n_nonzero_coefs'],
            bools=['fit_intercept', 'normalize'],
        )

        self.estimator = _OrthogonalMatchingPursuit(**out_params)

    def summary(self, options):
        """Return the fitted coefficient per feature, plus the intercept."""
        if len(options) != 2:  # only model name and mlspl_limits
            raise RuntimeError('"%s" models do not take options for summarization' % self.__class__.__name__)
        df = pd.DataFrame({'feature': self.columns,
                           'coefficient': self.estimator.coef_.ravel()})
        idf = pd.DataFrame({'feature': ['_intercept'],
                            'coefficient': [self.estimator.intercept_]})
        return pd.concat([df, idf])

    @staticmethod
    def register_codecs():
        from codec.codecs import SimpleObjectCodec
        from codec import codecs_manager
        codecs_manager.add_codec('algos_contrib.OrthogonalMatchingPursuit', 'OrthogonalMatchingPursuit', SimpleObjectCodec)
        codecs_manager.add_codec('sklearn.linear_model.omp', 'OrthogonalMatchingPursuit', SimpleObjectCodec)
|
||||
|
||||
@ -0,0 +1,27 @@
|
||||
from sklearn.svm import SVR as _SVR
|
||||
|
||||
from base import BaseAlgo, RegressorMixin
|
||||
from util.param_util import convert_params
|
||||
|
||||
|
||||
class SVR(RegressorMixin, BaseAlgo):
    """Support-vector regression wrapper around sklearn's SVR."""

    def __init__(self, options):
        self.handle_options(options)

        raw_params = options.get('params', {})
        estimator_params = convert_params(
            raw_params,
            floats=['C', 'gamma'],
            strs=['kernel'],
            ints=['degree'],
        )

        self.estimator = _SVR(**estimator_params)

    @staticmethod
    def register_codecs():
        from codec.codecs import SimpleObjectCodec
        from codec import codecs_manager
        codecs_manager.add_codec('algos_contrib.SVR', 'SVR', SimpleObjectCodec)
        codecs_manager.add_codec('sklearn.svm.classes', 'SVR', SimpleObjectCodec)
|
||||
@ -0,0 +1,48 @@
|
||||
import numpy as np
|
||||
from scipy.signal import savgol_filter
|
||||
|
||||
from base import BaseAlgo
|
||||
from util.param_util import convert_params
|
||||
from util import df_util
|
||||
|
||||
|
||||
class SavgolFilter(BaseAlgo):
    """Smooth numeric fields with a Savitzky-Golay filter.

    Each feature column is filtered independently and emitted as a new
    SG_<field> column merged back into the original dataframe.
    """

    def __init__(self, options):
        # set parameters
        params = options.get('params', {})
        out_params = convert_params(
            params,
            ints=['window_length', 'polyorder', 'deriv']
        )

        # Defaults: 5-sample window, quadratic fit, no derivative.
        # (dict.get replaces the previous three if/else blocks.)
        # scipy requires window_length to be odd and > polyorder.
        self.window_length = out_params.get('window_length', 5)
        self.polyorder = out_params.get('polyorder', 2)
        self.deriv = out_params.get('deriv', 0)

    def fit(self, df, options):
        """Filter every feature column and return df with SG_* columns added."""
        X = df.copy()
        X, nans, columns = df_util.prepare_features(X, self.feature_variables)

        def smooth(col):
            # keyword 'deriv=' for clarity (it is savgol_filter's 4th arg)
            return savgol_filter(col, self.window_length, self.polyorder,
                                 deriv=self.deriv)

        # Apply the 1-D filter down each column.
        y_hat = np.apply_along_axis(smooth, 0, X)

        names = ['SG_%s' % col for col in columns]
        output_df = df_util.create_output_dataframe(y_hat, nans, names)
        return df_util.merge_predictions(df, output_df)
|
||||
@ -0,0 +1,124 @@
|
||||
#!/usr/bin/env python
|
||||
'''
|
||||
Copy of existing TFIDF algo but with 2 boolean options added and 3 options set
|
||||
so that binary output is achieved.
|
||||
'''
|
||||
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer as _TfidfVectorizer
|
||||
|
||||
from base import BaseAlgo
|
||||
from codec import codecs_manager
|
||||
from util import df_util
|
||||
from util.param_util import convert_params
|
||||
|
||||
|
||||
class TFBinary(BaseAlgo):
    """Term-frequency vectorizer that emits binary (0/1) presence features.

    Copy of the stock TFIDF algo with use_idf/binary exposed as options and
    use_idf=False, norm=None, binary=True defaulted so each output column
    indicates presence of a term rather than a weighted frequency.
    """

    def handle_options(self, options):
        """Require exactly one feature field and no target field."""
        if len(options.get('feature_variables', [])) != 1 or len(options.get('target_variable', [])) > 0:
            raise RuntimeError('Syntax error: You must specify exactly one field')

    def __init__(self, options):
        self.handle_options(options)

        out_params = convert_params(
            options.get('params', {}),
            ints=['max_features'],
            bools=['use_idf', 'binary'],
            strs=['max_df', 'min_df',
                  'ngram_range', 'stop_words',
                  'analyzer', 'norm', 'token_pattern'],
        )

        # max_df/min_df arrive as strings; sklearn distinguishes float
        # proportions from int document counts, so coerce to the right type.
        for doc_freq, default_val in [('max_df', 1.0), ('min_df', 1)]:
            if doc_freq in out_params:
                # EAFP... convert max_df/min_df to float/int if it is a number.
                try:
                    float_val = float(out_params[doc_freq])
                    int_val = int(float_val)
                except ValueError:
                    raise RuntimeError('Syntax Error: {doc_freq} requires a numeric value, e.g. {doc_freq}=1.0'.format(doc_freq=doc_freq))
                if float_val == 1.0:
                    out_params[doc_freq] = default_val
                elif float_val == int_val:
                    out_params[doc_freq] = int_val
                else:
                    out_params[doc_freq] = float_val

        if 'ngram_range' in out_params:
            # Parse e.g. "1-5" into the (1, 5) tuple sklearn expects.
            try:
                out_params['ngram_range'] = tuple(int(i) for i in out_params['ngram_range'].split('-'))
                assert len(out_params['ngram_range']) == 2
            # Fix: was a bare 'except:', which would also swallow
            # SystemExit/KeyboardInterrupt; only parse failures belong here.
            except (ValueError, AssertionError):
                raise RuntimeError('Syntax Error: ngram_range requires a range, e.g. ngram_range=1-5')

        # TODO: Maybe let the user know that we make this change.
        out_params.setdefault('max_features', 100)

        # Binary defaults: no idf weighting, no normalization, 0/1 counts.
        out_params.setdefault('use_idf', False)
        out_params.setdefault('norm', None)
        out_params.setdefault('binary', True)

        self.estimator = _TfidfVectorizer(**out_params)

    def fit(self, df, options):
        """Learn the vocabulary from the single text feature field."""
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        # Make sure to turn off get_dummies
        X, _, self.columns = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            get_dummies=False,
            mlspl_limits=options.get('mlspl_limits'),
        )

        X = X.values.ravel().astype('str')
        self.estimator.fit(X)

    def make_output_names(self, options):
        """Build one output column name per learned vocabulary term."""
        default_name = self.feature_variables[0] + '_tfbin'
        output_name = options.get('output_name', default_name)
        feature_names = self.estimator.get_feature_names()
        return [output_name + '_' + str(index) + '_' + word
                for (index, word) in enumerate(feature_names)]

    def apply(self, df, options):
        """Vectorize the text field and merge the 0/1 columns into df."""
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        # Make sure to turn off get_dummies
        X, nans, _ = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            final_columns=self.columns,
            get_dummies=False,
            mlspl_limits=options.get('mlspl_limits'),
        )

        X = X.values.ravel().astype('str')
        y_hat = self.estimator.transform(X)

        # Convert the returned sparse matrix into a dense array
        y_hat = y_hat.toarray()

        output_names = self.make_output_names(options)

        output = df_util.create_output_dataframe(
            y_hat=y_hat,
            output_names=output_names,
            nans=nans,
        )

        df = df_util.merge_predictions(df, output)
        return df

    @staticmethod
    def register_codecs():
        from codec.codecs import SimpleObjectCodec
        codecs_manager.add_codec('algos_contrib.TFBinary', 'TFBinary', SimpleObjectCodec)
        codecs_manager.add_codec('sklearn.feature_extraction.text', 'TfidfVectorizer', SimpleObjectCodec)
        codecs_manager.add_codec('sklearn.feature_extraction.text', 'TfidfTransformer', SimpleObjectCodec)
        codecs_manager.add_codec('scipy.sparse.dia', 'dia_matrix', SimpleObjectCodec)
|
||||
@ -0,0 +1,83 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
from sklearn.manifold import TSNE as _TSNE
|
||||
|
||||
from base import BaseAlgo, TransformerMixin
|
||||
from codec import codecs_manager
|
||||
from util.param_util import convert_params
|
||||
|
||||
from util import df_util
|
||||
|
||||
class TSNE(TransformerMixin, BaseAlgo):
    """t-SNE embedding transformer.

    sklearn's TSNE has no separate transform(); apply() therefore runs
    fit_transform on whatever data it is given.
    """

    def __init__(self, options):
        self.handle_options(options)
        out_params = convert_params(
            options.get('params', {}),
            ints=['k', 'n_iter'],
            floats=['perplexity', 'early_exaggeration', 'learning_rate'],
            aliases={'k': 'n_components'}
        )

        # Bug fix: indexing out_params['n_components'] raised KeyError when
        # k was not supplied; validate only when present (sklearn's own
        # default of 2 is always valid).
        if out_params.get('n_components', 2) < 1:
            msg = 'Invalid value for k: k must be greater than or equal to 1, but found k="{}".'
            raise RuntimeError(msg.format(out_params['n_components']))

        # Defaults; the 'if key not in out_params' guards that previously
        # wrapped each setdefault call were redundant.
        out_params.setdefault('n_iter', 200)
        out_params.setdefault('perplexity', 30.0)
        out_params.setdefault('early_exaggeration', 4.0)
        out_params.setdefault('learning_rate', 100)

        self.estimator = _TSNE(**out_params)

    def rename_output(self, default_names, new_names):
        """Name output columns TSNE_1..TSNE_n (or <new_names>_i when given)."""
        if new_names is None:
            new_names = 'TSNE'
        return ['{}_{}'.format(new_names, i + 1)
                for i in xrange(len(default_names))]

    def apply(self, df, options):
        """Embed the feature fields and merge the coordinates into df."""
        # Make a copy of data, to not alter original dataframe
        X = df.copy()

        # Prepare the features
        X, nans, _ = df_util.prepare_features(
            X=X,
            variables=self.feature_variables,
            final_columns=self.columns,
        )

        # No transform() on TSNE -- compute the embedding directly.
        y_hat = self.estimator.fit_transform(X.values)

        # Assign output_name
        output_name = options.get('output_name', None)
        default_names = self.make_output_names(
            output_name=output_name,
            n_names=y_hat.shape[1],
        )
        output_names = self.rename_output(default_names, output_name)

        # Create output dataframe
        output = df_util.create_output_dataframe(
            y_hat=y_hat,
            nans=nans,
            output_names=output_names,
        )

        # Merge with original dataframe
        output = df_util.merge_predictions(df, output)
        return output

    @staticmethod
    def register_codecs():
        from codec.codecs import SimpleObjectCodec
        codecs_manager.add_codec('algos_contrib.TSNE', 'TSNE', SimpleObjectCodec)
        codecs_manager.add_codec('sklearn.manifold.t_sne', 'TSNE', SimpleObjectCodec)
|
||||
@ -0,0 +1,30 @@
|
||||
from sklearn.decomposition import TruncatedSVD as _TruncatedSVD
|
||||
from base import BaseAlgo, TransformerMixin
|
||||
from codec import codecs_manager
|
||||
from util.param_util import convert_params
|
||||
|
||||
class TruncatedSVD(TransformerMixin, BaseAlgo):
    """Dimensionality reduction via truncated singular value decomposition."""

    def __init__(self, options):
        self.handle_options(options)
        estimator_params = convert_params(
            options.get('params', {}),
            floats=['tol'],
            strs=['algorithm'],
            ints=['k', 'n_iter', 'random_state'],
            aliases={'k': 'n_components'}
        )
        self.estimator = _TruncatedSVD(**estimator_params)

    def rename_output(self, default_names, new_names):
        """Name output columns SVD_1..SVD_n (or <new_names>_i when given)."""
        prefix = 'SVD' if new_names is None else new_names
        return ['{}_{}'.format(prefix, idx + 1)
                for idx in xrange(len(default_names))]

    @staticmethod
    def register_codecs():
        from codec.codecs import SimpleObjectCodec
        codecs_manager.add_codec('algos_contrib.TruncatedSVD', 'TruncatedSVD', SimpleObjectCodec)
        codecs_manager.add_codec('sklearn.decomposition.truncated_svd', 'TruncatedSVD', SimpleObjectCodec)
|
||||
@ -0,0 +1,23 @@
|
||||
import pandas as pd

from algos_contrib.CustomDecisionTreeClassifier import CustomDecisionTreeClassifier
from test.contrib_util import AlgoTestUtils


def test_algo():
    """Interface / fit / serialization smoke test on a tiny mixed frame."""
    df = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [4, 5, 6],
        'c': ['a', 'b', 'c'],
    })
    opts = {
        'target_variable': ['a'],
        'feature_variables': ['b', 'c'],
    }
    methods = ('__init__', 'fit', 'apply', 'summary', 'register_codecs')
    AlgoTestUtils.assert_algo_basic(CustomDecisionTreeClassifier, methods, df, opts)
|
||||
|
||||
@ -0,0 +1,26 @@
|
||||
import pandas as pd

from algos_contrib.IsolationForest import IsolationForest
from test.contrib_util import AlgoTestUtils


def test_algo():
    """Interface smoke test with default options (no model persistence)."""
    AlgoTestUtils.assert_algo_basic(IsolationForest, serializable=False)


def test_algo_options():
    """Fit/apply smoke test on a small numeric frame."""
    df = pd.DataFrame({
        'a': [5.1, 4.9, 4.7, 4.6],
        'b': [3.5, 3.0, 3.1, 3.2],
        'c': [1.4, 1.4, 1.5, 1.6],
        'd': [0.2, 0.2, 0.2, 0.4],
        'e': ['Iris Setosa', 'Iris Setosa', 'Iris Versicolor', 'Iris Virginica'],
    })
    opts = {
        # NOTE(review): key 'target_variables' (plural) differs from the
        # 'target_variable' key used by the other tests -- confirm intended.
        'target_variables': [],
        'feature_variables': ['a', 'b', 'c', 'd'],
    }
    methods = ('__init__', 'fit', 'apply', 'register_codecs')
    AlgoTestUtils.assert_algo_basic(
        IsolationForest, required_methods=methods, input_df=df,
        options=opts, serializable=False)
|
||||
@ -0,0 +1,6 @@
|
||||
from algos_contrib.AgglomerativeClustering import AgglomerativeClustering
from test.contrib_util import AlgoTestUtils


def test_algo():
    """Smoke-test the algorithm's required interface.

    serializable=False skips codec round-trip checks -- presumably because
    this algo does not produce saved models; confirm against the algo.
    """
    AlgoTestUtils.assert_algo_basic(AgglomerativeClustering, serializable=False)
|
||||
@ -0,0 +1,6 @@
|
||||
from algos_contrib.CollaborativeFilter import CollaborativeFilter
from test.contrib_util import AlgoTestUtils


def test_algo():
    """Smoke-test the algorithm's required interface (no model persistence)."""
    AlgoTestUtils.assert_algo_basic(CollaborativeFilter, serializable=False)
|
||||
@ -0,0 +1,6 @@
|
||||
from algos_contrib.CorrelationMatrix import CorrelationMatrix
from test.contrib_util import AlgoTestUtils


def test_algo():
    """Smoke-test the algorithm's required interface (no model persistence)."""
    AlgoTestUtils.assert_algo_basic(CorrelationMatrix, serializable=False)
|
||||
@ -0,0 +1,7 @@
|
||||
from algos_contrib.ExampleAlgo import ExampleAlgo
from test.contrib_util import AlgoTestUtils


def test_algo():
    """Smoke-test the algorithm's required interface (no model persistence)."""
    AlgoTestUtils.assert_algo_basic(ExampleAlgo, serializable=False)
|
||||
|
||||
@ -0,0 +1,26 @@
|
||||
import pandas as pd

from algos_contrib.OrthogonalMatchingPursuit import OrthogonalMatchingPursuit
from test.contrib_util import AlgoTestUtils


def test_algo():
    """Interface / fit / serialization smoke test on a tiny mixed frame."""
    df = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [4, 5, 6],
        'c': ['a', 'b', 'c'],
    })
    opts = {
        'target_variable': ['a'],
        'feature_variables': ['b', 'c'],
    }
    methods = ('__init__', 'fit', 'partial_fit', 'apply', 'summary',
               'register_codecs')
    AlgoTestUtils.assert_algo_basic(OrthogonalMatchingPursuit, methods, df, opts)
|
||||
@ -0,0 +1,23 @@
|
||||
import pandas as pd

from algos_contrib.LatentDirichletAllocation import LatentDirichletAllocation
from test.contrib_util import AlgoTestUtils


def test_algo():
    """Interface / fit / serialization smoke test for the LDA wrapper."""
    df = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [4, 5, 6],
        'c': ['a', 'b', 'c'],
    })
    opts = {
        'feature_variables': ['b', 'c'],
    }
    methods = ('__init__', 'fit', 'partial_fit', 'apply', 'summary',
               'register_codecs')
    AlgoTestUtils.assert_algo_basic(LatentDirichletAllocation, methods, df, opts)
|
||||
@ -0,0 +1,26 @@
|
||||
import pandas as pd

from algos_contrib.LinearSVC import LinearSVC
from test.contrib_util import AlgoTestUtils


def test_algo():
    """Interface / fit / serialization smoke test on a tiny mixed frame."""
    df = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [4, 5, 6],
        'c': ['a', 'b', 'c'],
    })
    opts = {
        'target_variable': ['a'],
        'feature_variables': ['b', 'c'],
    }
    methods = ('__init__', 'fit', 'partial_fit', 'apply', 'summary',
               'register_codecs')
    AlgoTestUtils.assert_algo_basic(LinearSVC, methods, df, opts)
|
||||
@ -0,0 +1,6 @@
|
||||
from algos_contrib.MDS import MDS
from test.contrib_util import AlgoTestUtils


def test_algo():
    """Smoke-test the algorithm's required interface (no model persistence)."""
    AlgoTestUtils.assert_algo_basic(MDS, serializable=False)
|
||||
@ -0,0 +1,23 @@
|
||||
import pandas as pd

from algos_contrib.MinMaxScaler import MinMaxScaler
from test.contrib_util import AlgoTestUtils


def test_algo():
    """Interface / fit / serialization smoke test for the scaler."""
    df = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [4, 5, 6],
        'c': ['a', 'b', 'c'],
    })
    opts = {
        'feature_variables': ['a', 'b', 'c'],
    }
    methods = ('__init__', 'fit', 'partial_fit', 'apply', 'summary',
               'register_codecs')
    AlgoTestUtils.assert_algo_basic(MinMaxScaler, methods, df, opts)
|
||||
@ -0,0 +1,23 @@
|
||||
import pandas as pd

from algos_contrib.NMF import NMF
from test.contrib_util import AlgoTestUtils


def test_algo():
    """Interface / fit / serialization smoke test for the NMF wrapper."""
    df = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [4, 5, 6],
        'c': ['a', 'b', 'c'],
    })
    opts = {
        'feature_variables': ['a', 'b', 'c'],
    }
    methods = ('__init__', 'fit', 'partial_fit', 'apply', 'summary',
               'register_codecs')
    AlgoTestUtils.assert_algo_basic(NMF, methods, df, opts)
|
||||
@ -0,0 +1,26 @@
|
||||
import pandas as pd

from algos_contrib.OrthogonalMatchingPursuit import OrthogonalMatchingPursuit
from test.contrib_util import AlgoTestUtils


def test_algo():
    """Interface / fit / serialization smoke test.

    NOTE(review): this file duplicates the other OrthogonalMatchingPursuit
    test module -- confirm whether one of the two should be removed.
    """
    df = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [4, 5, 6],
        'c': ['a', 'b', 'c'],
    })
    opts = {
        'target_variable': ['a'],
        'feature_variables': ['b', 'c'],
    }
    methods = ('__init__', 'fit', 'partial_fit', 'apply', 'summary',
               'register_codecs')
    AlgoTestUtils.assert_algo_basic(OrthogonalMatchingPursuit, methods, df, opts)
|
||||
@ -0,0 +1,6 @@
|
||||
from algos_contrib.SavgolFilter import SavgolFilter
from test.contrib_util import AlgoTestUtils


def test_algo():
    """Smoke-test the algorithm's required interface (no model persistence)."""
    AlgoTestUtils.assert_algo_basic(SavgolFilter, serializable=False)
|
||||
@ -0,0 +1,50 @@
|
||||
import numpy as np
import pandas as pd

from algos_contrib.SVR import SVR
from test.contrib_util import AlgoTestUtils


def test_algo_basic():
    """Interface / fit / serialization smoke test on a tiny mixed frame."""
    df = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [4, 5, 6],
        'c': ['a', 'b', 'c'],
    })
    opts = {
        'target_variable': ['a'],
        'feature_variables': ['b', 'c'],
    }
    methods = ('__init__', 'fit', 'partial_fit', 'apply', 'summary',
               'register_codecs')
    AlgoTestUtils.assert_algo_basic(SVR, methods, df, opts)


def test_prediction():
    """A training point should predict within the default SVR epsilon (0.1)."""
    train_df = pd.DataFrame({
        'y': [1, 2, 3],
        'x1': [4, 5, 6],
        'x2': [7, 8, 9],
    })
    opts = {
        'target_variable': ['y'],
        'feature_variables': ['x1', 'x2'],
    }
    score_df = pd.DataFrame({
        'x1': [4],
        'x2': [7],
    })

    model = SVR(opts)
    model.feature_variables = opts['feature_variables']
    model.target_variable = opts['target_variable'][0]
    model.fit(train_df, opts)
    predicted = model.apply(score_df, opts)
    np.testing.assert_approx_equal(predicted['predicted(y)'].values, np.array([1.1]))
|
||||
|
||||
@ -0,0 +1,6 @@
|
||||
from algos_contrib.TFBinary import TFBinary
from test.contrib_util import AlgoTestUtils


def test_algo():
    """Smoke-test the algorithm's required interface (no model persistence)."""
    AlgoTestUtils.assert_algo_basic(TFBinary, serializable=False)
|
||||
@ -0,0 +1,23 @@
|
||||
import pandas as pd

from algos_contrib.TruncatedSVD import TruncatedSVD
from test.contrib_util import AlgoTestUtils


def test_algo():
    """Interface / fit / serialization smoke test for the SVD wrapper."""
    df = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [4, 5, 6],
        'c': ['a', 'b', 'c'],
    })
    opts = {
        'feature_variables': ['a', 'b', 'c'],
    }
    methods = ('__init__', 'fit', 'partial_fit', 'apply', 'summary',
               'register_codecs')
    AlgoTestUtils.assert_algo_basic(TruncatedSVD, methods, df, opts)
|
||||
@ -0,0 +1,38 @@
|
||||
import pytest

from algos_contrib.TSNE import TSNE
from test.contrib_util import AlgoTestUtils


def _options(**params):
    """Build a fresh options dict per test.

    Fix: the tests previously mutated a shared module-level ``algo_options``
    dict, making their outcomes depend on execution order; each test now
    gets its own dict.
    """
    opts = {'feature_variables': ['Review']}
    if params:
        opts['params'] = params
    return opts


def test_algo():
    """Interface smoke test (no model persistence)."""
    AlgoTestUtils.assert_algo_basic(TSNE, serializable=False)


def test_valid_params():
    """k is parsed as an int and forwarded as n_components."""
    TSNE_algo = TSNE(_options(k='1'))
    assert TSNE_algo.estimator.n_components == 1


def test_invalid_params_k_not_int():
    """A non-integer k is rejected with a clear message."""
    with pytest.raises((RuntimeError, ValueError)) as excinfo:
        _ = TSNE(_options(k='0.1'))
    assert excinfo.match('Invalid value for k: must be an int')


def test_invalid_params_k_not_valid():
    """k < 1 is rejected with a clear message."""
    with pytest.raises((RuntimeError, ValueError)) as excinfo:
        _ = TSNE(_options(k='0'))
    assert excinfo.match('Invalid value for k: k must be greater than or equal to 1')


def test_default_parameter_values():
    """Unspecified parameters fall back to the wrapper's defaults."""
    TSNE_algo = TSNE(_options(k='1'))
    assert TSNE_algo.estimator.n_iter == 200
    assert TSNE_algo.estimator.perplexity == 30.0
    assert TSNE_algo.estimator.early_exaggeration == 4.0
    assert TSNE_algo.estimator.learning_rate == 100
||||
@ -0,0 +1,29 @@
|
||||
#!/usr/bin/env python
|
||||
""" Small utility to add the MLTK bin path to the system path.
|
||||
This makes it easy to import algorithms or utilities from the MLTK."""
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
def check_splunk_home(splunk_home):
    """Validate that a SPLUNK_HOME value was supplied.

    Args:
        splunk_home (str or None): value of the $SPLUNK_HOME variable.

    Raises:
        RuntimeError: when *splunk_home* is empty or None.
    """
    if splunk_home:
        return
    raise RuntimeError('No $SPLUNK_HOME provided. Please set SPLUNK_HOME.')
|
||||
|
||||
|
||||
def get_mltk_bin_path(splunk_home):
    """Build and validate the path to the MLTK bin folder.

    Args:
        splunk_home (str): value of $SPLUNK_HOME.

    Returns:
        str: <splunk_home>/etc/apps/Splunk_ML_Toolkit/bin

    Raises:
        RuntimeError: when splunk_home is unset or the folder is absent.
    """
    check_splunk_home(splunk_home)
    bin_path = os.path.join(splunk_home, 'etc', 'apps', 'Splunk_ML_Toolkit', 'bin')

    if os.path.exists(bin_path):
        return bin_path

    raise RuntimeError('MLTK bin folder not found at {}: is MLTK installed?'.format(bin_path))
|
||||
|
||||
|
||||
def add_mltk():
    """Prepend the MLTK bin directory (derived from $SPLUNK_HOME) to sys.path."""
    home = os.environ.get('SPLUNK_HOME', None)
    sys.path.insert(0, get_mltk_bin_path(home))
|
||||
@ -0,0 +1,5 @@
|
||||
# Make the MLTK's bin directory importable before pulling in test helpers
# that live inside the MLTK package.
from link_mltk import add_mltk
add_mltk()

# NOTE(review): imported for its side effects / re-export — confirm callers.
from test.util import check_signatures
|
||||
|
||||
@ -0,0 +1,182 @@
|
||||
""" Utility methods for use in testing."""
|
||||
import ConfigParser
|
||||
import json
|
||||
import os
|
||||
from inspect import getargspec
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from base import BaseAlgo
|
||||
from codec import MLSPLDecoder, MLSPLEncoder
|
||||
|
||||
|
||||
# Package name every contributed algorithm must declare in algos.conf.
PACKAGE_NAME = 'algos_contrib'
|
||||
|
||||
|
||||
class AlgoTestUtils(object):
    """
    Helper methods for testing algorithm implementations.

    Every assert_* helper returns None and reports failure by raising
    AssertionError.

    NOTE(review): this module targets Python 2 (ConfigParser, getargspec,
    dict.viewkeys, readfp) — confirm the interpreter before porting.
    NOTE(review): several checks use bare 'assert', which is stripped when
    Python runs with -O; these helpers must not run optimized.
    """
    @staticmethod
    def assert_method_signature(algo_cls, method_name, args):
        """
        Assert the signature of the specified method.

        Args:
            algo_cls (class): a custom algorithm class to check
            method_name (str): the name of the method
            args (list): expected argument names, in order, as reported by
                inspect.getargspec on the class attribute

        Raises:
            AssertionError: if the method is missing, not callable, or its
                argument list differs from *args*.
        """
        method = getattr(algo_cls, method_name, None)
        assert method, "Method '{}' does not exist".format(method_name)
        assert callable(method), "Method '{}' is not callable".format(method_name)
        found_args = getargspec(method).args
        # NOTE(review): the message interpolates *args* (expected) and
        # *found_args* (actual) in the opposite order of its wording
        # ("has signature X - but should have Y") — worth fixing.
        msg = 'Method {} has signature: {} - but should have {}'.format(method, args, found_args)
        assert found_args == args, msg

    @classmethod
    def assert_registered(cls, algo_cls):
        """
        Assert that the algorithm is registered in the algos.conf file.

        Registration means a conf section named after the class with a
        'package' option equal to PACKAGE_NAME.

        Args:
            algo_cls (class): a custom algorithm class to check

        Raises:
            AssertionError: if the section or its 'package' option is
                missing, or the declared package name is wrong.
        """
        config = ConfigParser.RawConfigParser()
        # get_algos_conf_fp is patched by the unit tests to inject an
        # in-memory conf file.
        with cls.get_algos_conf_fp() as f:
            config.readfp(f)
        algo_name = algo_cls.__name__
        try:
            package_name = config.get(algo_name, 'package')
        except ConfigParser.NoSectionError:
            assert False, "'{}' not registered in algos.conf".format(algo_name)
        except ConfigParser.NoOptionError:
            assert False, "'{}' must override 'package' option in algos.conf".format(algo_name)

        assert package_name == PACKAGE_NAME, "The package name must be '{}'".format(PACKAGE_NAME)

    @staticmethod
    def assert_serializable(algo_cls, input_df, options):
        """
        Assert that the model created by the algorithm is serializable.

        Fits an instance on *input_df*, round-trips it through the MLSPL
        JSON encoder/decoder, and checks that the decoded copy's apply()
        output equals the original's.

        Args:
            algo_cls (class): a custom algorithm class to check
            input_df (pandas DataFrame): input dataframe; expected to
                provide columns 'a', 'b' and 'c' (used as target/features
                below) — TODO confirm with callers.
            options (dict): options for __init__, fit() and apply()

        Raises:
            AssertionError: if codecs are missing or the round-tripped
                model produces different output.
        """
        assert hasattr(algo_cls, 'register_codecs')
        algo_cls.register_codecs()

        algo_inst = algo_cls(options)
        # Hard-coded feature/target column names: callers' input_df must
        # supply them (see the TruncatedSVD test fixture in this repo).
        algo_inst.feature_variables = ['b', 'c']
        algo_inst.target_variable = 'a'
        algo_inst.fit(input_df.copy(), options)

        encoded = json.dumps(algo_inst, cls=MLSPLEncoder)
        decoded = json.loads(encoded, cls=MLSPLDecoder)

        # The decoded model must behave identically to the original.
        orig_y = algo_inst.apply(input_df.copy(), options)
        decoded_y = decoded.apply(input_df.copy(), options)
        pd.util.testing.assert_frame_equal(orig_y, decoded_y)

    @classmethod
    def assert_base_algo_method_signatures(cls, algo_cls, required_methods=None):
        """
        Assert that the signatures of the algorithm's methods adhere to the API.

        Args:
            algo_cls (class): a custom algorithm class to check.
            required_methods (list): names of required methods.
                '__init__' and 'fit' are always required, so they do not
                need to be included.

        Raises:
            AssertionError: if algo_cls does not subclass BaseAlgo, a name
                is not part of the BaseAlgo API, or a signature differs.
        """
        # Canonical argument lists for every method in the BaseAlgo API.
        # NOTE(review): 'register_codecs' maps to [] although it is defined
        # as a classmethod elsewhere in this repo (getargspec on a bound
        # classmethod reports ['cls']) — confirm against BaseAlgo.
        method_args_map = {
            '__init__': ['self', 'options'],
            'fit': ['self', 'df', 'options'],
            'partial_fit': ['self', 'df', 'options'],
            'apply': ['self', 'df', 'options'],
            'summary': ['self', 'options'],
            'register_codecs': [],
        }

        if required_methods is None:
            required_methods = []

        assert issubclass(algo_cls, BaseAlgo), 'Algorithms must inherit from BaseAlgo.'

        # Reject names that are not part of the API at all.
        required_method_set = set(required_methods)
        extra_methods = required_method_set - method_args_map.viewkeys()
        assert extra_methods == set(), "'{}' not in BaseAlgo".format(", ".join(extra_methods))

        # __init__ and fit are always required.
        required_method_set.add('__init__')
        required_method_set.add('fit')

        for required_method in required_method_set:
            cls.assert_method_signature(algo_cls, required_method, method_args_map[required_method])

    @classmethod
    def assert_algo_basic(cls, algo_cls, required_methods=None, input_df=None, options=None, serializable=True):
        """
        Assert signatures of methods, registration, and serialization.

        Args:
            algo_cls (class): a custom algorithm class to check.
            required_methods (list): names of required methods (see
                assert_base_algo_method_signatures).
            input_df (pandas DataFrame): input dataframe for the algorithm
                being tested; required when serializable is True.
            options (dict): options for fit()/apply(); required when
                serializable is True.
            serializable (bool): whether to check serializability.

        Raises:
            AssertionError: if any of the three checks fails.
        """
        cls.assert_base_algo_method_signatures(algo_cls, required_methods)
        cls.assert_registered(algo_cls)
        if serializable:
            # The input and options are required for serializability test.
            assert input_df is not None
            assert options is not None
            cls.assert_serializable(algo_cls, input_df, options)

    @staticmethod
    def get_algos_conf_fp():
        """
        Get a file object for algos.conf, open for reading.

        This method mainly exists to aid testing: unit tests patch it to
        inject an in-memory conf file.

        Returns:
            (file): open handle to ../../default/algos.conf relative to
                this module; the caller is responsible for closing it.
        """
        algos_file_path = os.path.join(os.path.dirname(__file__), '..', '..', 'default', 'algos.conf')
        return open(algos_file_path)
|
||||
|
||||
|
||||
@ -0,0 +1,152 @@
|
||||
import mock
|
||||
import io
|
||||
import pandas as pd
|
||||
import pytest
|
||||
import sys
|
||||
|
||||
from base import BaseAlgo
|
||||
from util.base_util import MLSPLNotImplementedError
|
||||
|
||||
from contrib_util import AlgoTestUtils
|
||||
|
||||
|
||||
@pytest.fixture
def min_algo_cls():
    """Fixture: a do-nothing BaseAlgo subclass inheriting all default methods.

    The class name 'MinimalAlgo' is load-bearing: it matches the stanza
    name in the mock algos.conf strings below.
    """
    class MinimalAlgo(BaseAlgo):
        pass
    return MinimalAlgo
|
||||
|
||||
|
||||
@pytest.fixture
def serializable_algo_cls():
    """Fixture: a minimal BaseAlgo subclass that supports codec round-trips.

    apply() echoes its input dataframe, so the round-trip comparison in
    AlgoTestUtils.assert_serializable is trivially satisfiable.
    """
    class SerializableAlgo(BaseAlgo):
        def __init__(self, options):
            pass

        def fit(self, df, options):
            pass

        def apply(self, df, options):
            return df

        @classmethod
        def register_codecs(cls):
            from codec.codecs import SimpleObjectCodec
            from codec import codecs_manager
            # The module path string must match where the decoder will look
            # the class up — i.e. this very module (see setattr below).
            codecs_manager.add_codec('test.test_contrib_util', 'SerializableAlgo', SimpleObjectCodec)

        # Add the class to this module so that encoder and decoder can access it.
        # This is only necessary for a fixture function. Normally, these classes will be defined within a module.
    setattr(sys.modules[__name__], 'SerializableAlgo', SerializableAlgo)
    return SerializableAlgo
|
||||
|
||||
|
||||
# In-memory algos.conf stanza used to patch AlgoTestUtils.get_algos_conf_fp:
# a registered algorithm with the mandatory 'package' override.
mock_algo_conf = """
[MinimalAlgo]
package=algos_contrib
"""


# Same stanza without the 'package' option, to exercise the
# missing-override failure path of assert_registered.
mock_algo_conf_no_package = """
[MinimalAlgo]
"""
|
||||
|
||||
|
||||
def test_method_signature(min_algo_cls):
    """The inherited fit() exposes the documented (self, df, options) signature."""
    expected_args = ['self', 'df', 'options']
    AlgoTestUtils.assert_method_signature(min_algo_cls, 'fit', expected_args)
|
||||
|
||||
|
||||
# Patch the conf-file accessor so assert_registered reads the in-memory
# stanza instead of the real algos.conf on disk.
@mock.patch.object(AlgoTestUtils, 'get_algos_conf_fp', return_value=io.BytesIO(mock_algo_conf))
def test_registered(mock_get_algos_conf_fp, min_algo_cls):
    """A stanza with a proper 'package' override passes assert_registered."""
    AlgoTestUtils.assert_registered(min_algo_cls)
|
||||
|
||||
|
||||
def test_serializable(serializable_algo_cls):
    """An algo with codecs registered survives the encode/decode round trip."""
    AlgoTestUtils.assert_serializable(serializable_algo_cls, input_df=pd.DataFrame({}), options={})
|
||||
|
||||
|
||||
def test_base_algo_method_signatures_default_methods(min_algo_cls):
    """With no explicit list, only the implicit __init__ and fit are checked."""
    AlgoTestUtils.assert_base_algo_method_signatures(min_algo_cls)
|
||||
|
||||
|
||||
def test_base_algo_method_signatures_all_methods(min_algo_cls):
    """Every listed API method passes the signature check on a minimal algo."""
    all_methods = [
        '__init__',
        'fit',
        'partial_fit',
        'apply',
        'register_codecs',
    ]
    AlgoTestUtils.assert_base_algo_method_signatures(min_algo_cls, required_methods=all_methods)
|
||||
|
||||
|
||||
def test_base_algo_method_signatures_extra_methods(min_algo_cls):
    """Names that are not part of the BaseAlgo API are rejected."""
    extra_args = [
        'extra1',
        'extra2',
    ]
    with pytest.raises(AssertionError) as e:
        AlgoTestUtils.assert_base_algo_method_signatures(min_algo_cls, required_methods=[
            '__init__',
            'fit',
            'partial_fit',
            'apply',
            'register_codecs',
        ] + extra_args)
    # The failure message lists the offending names. Check membership rather
    # than interpolating repr(extra_args) into a regex: the list repr's
    # brackets/quotes are regex metacharacters, and the names come from a
    # set whose iteration order is arbitrary.
    message = str(e.value)
    assert 'not in BaseAlgo' in message
    for name in extra_args:
        assert name in message
|
||||
|
||||
|
||||
# Patch the conf-file accessor so the registration step reads the
# in-memory stanza; serialization is skipped for the minimal algo.
@mock.patch.object(AlgoTestUtils, 'get_algos_conf_fp', return_value=io.BytesIO(mock_algo_conf))
def test_algo_basic(mock_get_algos_conf_fp, min_algo_cls):
    """The combined signature + registration check passes for a minimal algo."""
    AlgoTestUtils.assert_algo_basic(min_algo_cls, serializable=False)
|
||||
|
||||
|
||||
def test_no_base_algo():
    """Classes that do not inherit from BaseAlgo fail the signature check."""
    class NoBaseAlgo(object):
        pass

    with pytest.raises(AssertionError) as e:
        AlgoTestUtils.assert_base_algo_method_signatures(NoBaseAlgo)
    assert e.match('must inherit from BaseAlgo')
|
||||
|
||||
|
||||
def test_method_signature_non_existent(min_algo_cls):
    """Asking about a method the class does not have raises AssertionError."""
    bad_method = 'foot'
    with pytest.raises(AssertionError) as e:
        AlgoTestUtils.assert_method_signature(min_algo_cls, bad_method, ['self', 'df', 'options'])
    # excinfo.match() raises on mismatch by itself, but assert it explicitly
    # for consistency with every other test in this module.
    assert e.match("{}.*does not exist".format(bad_method))
|
||||
|
||||
|
||||
def test_method_signature_not_callable(min_algo_cls):
    """A non-callable class attribute fails the signature check."""
    bad_method = 'fit'

    # Shadow the inherited fit() with a plain string attribute (the original
    # comment said "property", but this is just a non-callable attribute).
    min_algo_cls.fit = 'fit'

    with pytest.raises(AssertionError) as e:
        AlgoTestUtils.assert_method_signature(min_algo_cls, bad_method, ['self', 'df', 'options'])
    # Assert the match explicitly for consistency with the other tests.
    assert e.match("{}.*not callable".format(bad_method))
|
||||
|
||||
|
||||
# The mock conf only contains a [MinimalAlgo] stanza, so any other class
# name must fail the registration check.
@mock.patch.object(AlgoTestUtils, 'get_algos_conf_fp', return_value=io.BytesIO(mock_algo_conf))
def test_unregistered(mock_get_algos_conf_fp):
    """A class with no stanza in algos.conf fails assert_registered."""
    class UnregisteredAlgo(BaseAlgo):
        pass

    with pytest.raises(AssertionError) as e:
        AlgoTestUtils.assert_registered(UnregisteredAlgo)
    assert e.match('{}.*not registered'.format(UnregisteredAlgo.__name__))
|
||||
|
||||
|
||||
# Conf stanza exists but lacks the mandatory 'package' option.
@mock.patch.object(AlgoTestUtils, 'get_algos_conf_fp', return_value=io.BytesIO(mock_algo_conf_no_package))
def test_registered_with_missing_package_option(mock_get_algos_conf_fp, min_algo_cls):
    """A stanza without 'package' fails with the override error message."""
    with pytest.raises(AssertionError) as e:
        AlgoTestUtils.assert_registered(min_algo_cls)
    assert e.match('{}.*must override.*package'.format(min_algo_cls.__name__))
|
||||
|
||||
|
||||
def test_not_serializable(min_algo_cls):
    """An algo without codec support raises MLSPLNotImplementedError.

    The minimal algo inherits register_codecs from BaseAlgo, which
    presumably raises MLSPLNotImplementedError — confirm against BaseAlgo.
    """
    with pytest.raises(MLSPLNotImplementedError) as e:
        AlgoTestUtils.assert_serializable(min_algo_cls, input_df=pd.DataFrame({}), options={})
    assert e.match('does not support saving')
|
||||
|
||||
|
||||
@ -0,0 +1,62 @@
|
||||
# Here is where algorithms are registered.
|
||||
[default]
|
||||
|
||||
########################################################################
|
||||
# Due to the layering of configuration files in Splunk, we have to
|
||||
# override the package name in every section.
|
||||
########################################################################
|
||||
|
||||
|
||||
[AgglomerativeClustering]
|
||||
package=algos_contrib
|
||||
|
||||
[CorrelationMatrix]
|
||||
package=algos_contrib
|
||||
|
||||
[ExampleAlgo]
|
||||
package=algos_contrib
|
||||
|
||||
[SVR]
|
||||
package=algos_contrib
|
||||
|
||||
[SavgolFilter]
|
||||
package=algos_contrib
|
||||
|
||||
[TSNE]
|
||||
package=algos_contrib
|
||||
|
||||
[MDS]
|
||||
package=algos_contrib
|
||||
|
||||
[OrthogonalMatchingPursuit]
|
||||
package=algos_contrib
|
||||
|
||||
[TruncatedSVD]
|
||||
package=algos_contrib
|
||||
|
||||
[LatentDirichletAllocation]
|
||||
package=algos_contrib
|
||||
|
||||
[NMF]
|
||||
package=algos_contrib
|
||||
|
||||
[CollaborativeFilter]
|
||||
package=algos_contrib
|
||||
|
||||
[CustomDecisionTreeClassifier]
|
||||
package=algos_contrib
|
||||
|
||||
[TFBinary]
package=algos_contrib

[MinMaxScaler]
package=algos_contrib

[LinearSVC]
package=algos_contrib

[ExtraTreesClassifier]
package=algos_contrib

[IsolationForest]
package=algos_contrib
|
||||
@ -0,0 +1,18 @@
|
||||
#
|
||||
# Splunk app configuration file
|
||||
#
|
||||
|
||||
[install]
|
||||
is_configured = 0
|
||||
|
||||
[package]
|
||||
id = SA_mltk_contrib_app
|
||||
|
||||
[ui]
|
||||
is_visible = false
|
||||
label = Splunk MLTK Algorithms on GitHub
|
||||
|
||||
[launcher]
|
||||
author = Gyanendra Rana
|
||||
description = An app based on Open Source GitHub repo for Splunk Machine Learning Toolkit Algorithms
|
||||
version = 1.0
|
||||
@ -0,0 +1,7 @@
|
||||
<nav search_view="search" color="#65A637">
|
||||
<view name="search" default='true' />
|
||||
<view name="datasets" />
|
||||
<view name="reports" />
|
||||
<view name="alerts" />
|
||||
<view name="dashboards" />
|
||||
</nav>
|
||||
@ -0,0 +1 @@
|
||||
Add all the views that your app needs in this directory
|
||||
@ -0,0 +1,39 @@
|
||||
|
||||
# Application-level permissions
|
||||
|
||||
[]
|
||||
access = read : [ * ], write : [ admin, power ]
|
||||
|
||||
### EVENT TYPES
|
||||
|
||||
[eventtypes]
|
||||
export = system
|
||||
|
||||
|
||||
### PROPS
|
||||
|
||||
[props]
|
||||
export = system
|
||||
|
||||
|
||||
### TRANSFORMS
|
||||
|
||||
[transforms]
|
||||
export = system
|
||||
|
||||
|
||||
### LOOKUPS
|
||||
|
||||
[lookups]
|
||||
export = system
|
||||
|
||||
|
||||
### VIEWSTATES: even normal users should be able to create shared viewstates
|
||||
|
||||
[viewstates]
|
||||
access = read : [ * ], write : [ * ]
|
||||
export = system
|
||||
|
||||
|
||||
[algos]
|
||||
export = system
|
||||
@ -0,0 +1,233 @@
|
||||
{
|
||||
"version": "1.0",
|
||||
"date": "2023-04-26T14:53:07.461206713Z",
|
||||
"hashAlgorithm": "SHA-256",
|
||||
"app": {
|
||||
"id": 4403,
|
||||
"version": "1.0",
|
||||
"files": [
|
||||
{
|
||||
"path": "LICENSE",
|
||||
"hash": "c71d239df91726fc519c6eb72d318ec65820627232b2f796219e87dcf35d0ab4"
|
||||
},
|
||||
{
|
||||
"path": "bin/README.md",
|
||||
"hash": "597cdad620bec4e52e0e8adc3cad99de9b3ce45da0dd18e4159e1009c976e957"
|
||||
},
|
||||
{
|
||||
"path": "bin/test/test_contrib_util.py",
|
||||
"hash": "f521bae6ecd4bf13d969fc9ba8fd8b1948fbd59e9dadaf9a1355f5a549cdbe32"
|
||||
},
|
||||
{
|
||||
"path": "bin/test/__init__.py",
|
||||
"hash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
|
||||
},
|
||||
{
|
||||
"path": "bin/test/contrib_util.py",
|
||||
"hash": "7c24c1ced03aacce92c29095be20d494cba5c0ed1fc7fa167c713c061c151ffd"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/MinMaxScaler.py",
|
||||
"hash": "59752f166603a4c85c057acd12bb155be5e98e8c89d4c2944fda247daf860802"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/ExampleAlgo.py",
|
||||
"hash": "1f67435c2ad60172129e45d6d4d7f8d0ce6c47df1603771166e35a3ee96fc5b4"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/CustomDecisionTreeClassifier.py",
|
||||
"hash": "8ce1dc0b4dcf774ca1d6e327b737d695d8e091bda2664a756f31da6921e52abd"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/CorrelationMatrix.py",
|
||||
"hash": "9ab4f8070c695a744c3a1219b27bf265c7044756a9cfdcb507e40a0a7861213a"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/TSNE.py",
|
||||
"hash": "7c0f772ca89df4480939285ee33857d5eadcad8bb73d80ae834835dad98439b7"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/ExtraTreesClassifier.py",
|
||||
"hash": "c9c085a39267c0fd8e32ac3830501fc095b431ded57ba7ddc79d05c6576f756c"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/TruncatedSVD.py",
|
||||
"hash": "d16e383f59b216aae3fd6725632a431c4c54c1e0b2a4f4a6ac203f518d6bca39"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/AgglomerativeClustering.py",
|
||||
"hash": "d121cb6ff52f06975777c2f155a9f6d3e58c6aa3c1156bdca3ed65d7e33f22ef"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/SVR.py",
|
||||
"hash": "385c74c4cefdbbb972de4fb9eff78cde879133596285653dad8f3eb00f3840e2"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/SavgolFilter.py",
|
||||
"hash": "9b174720370b5425f4fec2364e02e9dd7e1e76bdeedc55a7087b232069d58649"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/CollaborativeFilter.py",
|
||||
"hash": "94384d011c281796c8091ed12f812b4781cfff7f633fc47e6873489a445685b1"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/TFBinary.py",
|
||||
"hash": "399b00a32d2a445fd2dee2cc42a0bde976a1d7640417e5ef4ee64ab60b7e917f"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/LatentDirichletAllocation.py",
|
||||
"hash": "2c3eec09909771fe0868e4e666b536ba3be76034011bdcc2c7b830f9b65d716e"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/__init__.py",
|
||||
"hash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/LinearSVC.py",
|
||||
"hash": "bf55507202eb0743d0001ca3ae861cc7bd14a9eaffc2424e983039f6e724c736"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/OrthogonalMatchingPursuit.py",
|
||||
"hash": "7916076ba3f1bca181d1ac3efbc21e57583376612cf34dd70378c4f0002c91a6"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/MDS.py",
|
||||
"hash": "c43efcffee0dd3422967e64ccbd8dd5bb7c34da6f2e8a3b3605221251b065b12"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/IsolationForest.py",
|
||||
"hash": "72fc6e8342f9eaf130570cc2e41a903624226d847ae4259e2cd4983d930c8298"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/NMF.py",
|
||||
"hash": "62054702577674e0637c94c24026b541b2fa1741ae9dc2a740ab29779550eef2"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/tests/test_linear_svc.py",
|
||||
"hash": "6a4cabc9a6617f9ee4e9502b71c7532bb7b3adf09ed9164ca63f7f5d98ef76e5"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/tests/test_tsne.py",
|
||||
"hash": "0f7517b0dbe6f0c373223605fdd7937eedd8d77024dda48cd2af86631cd1c6f8"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/tests/test_svr.py",
|
||||
"hash": "c59ed4d4217a71408d69003f77feb7ca3160369a4e58570fc34c12269ac934eb"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/tests/test_truncated_svd.py",
|
||||
"hash": "0f306f7b2395c94c378571ff7040e4907c66f718520c5b59b28a0437fa74a97f"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/tests/test_IsolationForest.py",
|
||||
"hash": "eff84b1f9ef5802a20e8e928c7c60ebfd6ecc8f4e71cc40aaea9ae540ecb64ad"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/tests/test_min_max_scaler.py",
|
||||
"hash": "577fdac81ed8e78c2c0b4142d477373f1552ba9c48e04cb3683e4badb6b16e39"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/tests/test_mds.py",
|
||||
"hash": "8090f4fb85746f3ed42049eb967048fb389bc06d8cc9f08432a948980eceff78"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/tests/test_latent_dirichlet_allocation.py",
|
||||
"hash": "dfaa22fd3ec482b67bd04b9780eab6f103de4b2255f4ebd272fd42a28ee3468d"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/tests/test_collaborativefilter.py",
|
||||
"hash": "9e4b5115862ec45f3c6fcf77db75c8603fd7920d990b512d17b3f9b013df1668"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/tests/test_CustomDecisionTreeClassifier.py",
|
||||
"hash": "96e67dea269fcaf38005a34de4baee6205007d905d4bdfe5b2750f3da79e44c9"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/tests/test_correlation_matrix.py",
|
||||
"hash": "a47a1ca416ed9553aaa81d749198c2d578c190704168b0c2fc8cd7c81c196119"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/tests/__init__.py",
|
||||
"hash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/tests/test_example_algo.py",
|
||||
"hash": "e4f9509b4a9c9d30cfc8ab5faf5424de04b13713d5be1d1c4d82aa988426ca59"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/tests/test_extra_trees_classifier.py",
|
||||
"hash": "7aa79278f98010183abb1be6c937c456bc611f9233aa9199be0030533e8f9ee9"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/tests/test_savgol_filter.py",
|
||||
"hash": "586e613e41b819b67e239ed316aed90e6a7b9602e3f31d8e41da9b819846013d"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/tests/test_agglomerative_clustering.py",
|
||||
"hash": "5afa871420153c1f0e82cde120d675b89a7048b03a9b6cf17de20316d3f3dc3d"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/tests/test_orthogonal_matching_pursuit.py",
|
||||
"hash": "7aa79278f98010183abb1be6c937c456bc611f9233aa9199be0030533e8f9ee9"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/tests/test_tf_binary.py",
|
||||
"hash": "48761919a278357b5ef9a41a0fe79d80a139a9bf6531e27c69dbf01368cadc2d"
|
||||
},
|
||||
{
|
||||
"path": "bin/algos_contrib/tests/test_nmf.py",
|
||||
"hash": "246c86e04b3875907c4f70669efd05df3eebad8016b0dd24019c74e12f10ed08"
|
||||
},
|
||||
{
|
||||
"path": "bin/test.py",
|
||||
"hash": "cacc66edf525a77a5cb451770360701533ef454391a96be6044179f7df1ca9d1"
|
||||
},
|
||||
{
|
||||
"path": "bin/link_mltk.py",
|
||||
"hash": "d94b783b59e249590eacfed4a01edb723f7671c51cef7c6730ba8c41c0e7fa7e"
|
||||
},
|
||||
{
|
||||
"path": "default/algos.conf",
|
||||
"hash": "d8a2c63b1406b31f5c98c0bced406895ecd02104455b098fcbf8672c459751f0"
|
||||
},
|
||||
{
|
||||
"path": "default/data/ui/views/README.md",
|
||||
"hash": "4ccd9dc2dca5bd634f7c07ad1749e4e63a7969c84e2eff83517256f7c884cd29"
|
||||
},
|
||||
{
|
||||
"path": "default/data/ui/nav/default.xml",
|
||||
"hash": "e5e0678bca27efa4ded83f8f83a7f2ef10291a4d66fa43ac4f95ce735fb3e824"
|
||||
},
|
||||
{
|
||||
"path": "default/app.conf",
|
||||
"hash": "9ca504d6baa4020f4583d9a950bf259bd706882243483dff2834e787fc376174"
|
||||
},
|
||||
{
|
||||
"path": "metadata/default.meta",
|
||||
"hash": "721109ec9f1724ee76ce3d9a4ef68ab1f27dac40ee213f147744e3028b5090ac"
|
||||
}
|
||||
]
|
||||
},
|
||||
"products": [
|
||||
{
|
||||
"platform": "splunk",
|
||||
"product": "enterprise",
|
||||
"versions": [
|
||||
"7.0",
|
||||
"7.1",
|
||||
"7.2"
|
||||
],
|
||||
"architectures": [
|
||||
"x86_64"
|
||||
],
|
||||
"operatingSystems": [
|
||||
"windows",
|
||||
"linux",
|
||||
"macos",
|
||||
"freebsd",
|
||||
"solaris",
|
||||
"aix"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
Loading…
Reference in new issue