From 56995569d15fc6a032f0b4d77e41930e888a18a7 Mon Sep 17 00:00:00 2001 From: admingit Date: Mon, 8 Jan 2024 17:14:59 +0100 Subject: [PATCH] add_algo_MLTK --- deployment-apps/SA_mltk_contrib_app/LICENSE | 201 +++++++++++++++ .../SA_mltk_contrib_app/bin/README.md | 1 + .../algos_contrib/AgglomerativeClustering.py | 92 +++++++ .../bin/algos_contrib/CollaborativeFilter.py | 109 ++++++++ .../bin/algos_contrib/CorrelationMatrix.py | 57 +++++ .../CustomDecisionTreeClassifier.py | 66 +++++ .../bin/algos_contrib/ExampleAlgo.py | 9 + .../bin/algos_contrib/ExtraTreesClassifier.py | 51 ++++ .../bin/algos_contrib/IsolationForest.py | 128 ++++++++++ .../LatentDirichletAllocation.py | 35 +++ .../bin/algos_contrib/LinearSVC.py | 29 +++ .../bin/algos_contrib/MDS.py | 83 +++++++ .../bin/algos_contrib/MinMaxScaler.py | 60 +++++ .../bin/algos_contrib/NMF.py | 31 +++ .../OrthogonalMatchingPursuit.py | 38 +++ .../bin/algos_contrib/SVR.py | 27 ++ .../bin/algos_contrib/SavgolFilter.py | 48 ++++ .../bin/algos_contrib/TFBinary.py | 124 ++++++++++ .../bin/algos_contrib/TSNE.py | 83 +++++++ .../bin/algos_contrib/TruncatedSVD.py | 30 +++ .../bin/algos_contrib/__init__.py | 0 .../bin/algos_contrib/tests/__init__.py | 0 .../test_CustomDecisionTreeClassifier.py | 23 ++ .../tests/test_IsolationForest.py | 26 ++ .../tests/test_agglomerative_clustering.py | 6 + .../tests/test_collaborativefilter.py | 6 + .../tests/test_correlation_matrix.py | 6 + .../algos_contrib/tests/test_example_algo.py | 7 + .../tests/test_extra_trees_classifier.py | 26 ++ .../tests/test_latent_dirichlet_allocation.py | 23 ++ .../algos_contrib/tests/test_linear_svc.py | 26 ++ .../bin/algos_contrib/tests/test_mds.py | 6 + .../tests/test_min_max_scaler.py | 23 ++ .../bin/algos_contrib/tests/test_nmf.py | 23 ++ .../tests/test_orthogonal_matching_pursuit.py | 26 ++ .../algos_contrib/tests/test_savgol_filter.py | 6 + .../bin/algos_contrib/tests/test_svr.py | 50 ++++ .../bin/algos_contrib/tests/test_tf_binary.py | 6 + .../algos_contrib/tests/test_truncated_svd.py | 23 ++ .../bin/algos_contrib/tests/test_tsne.py | 38 +++ .../SA_mltk_contrib_app/bin/link_mltk.py | 29 +++ .../SA_mltk_contrib_app/bin/test.py | 5 + .../SA_mltk_contrib_app/bin/test/__init__.py | 0 .../bin/test/contrib_util.py | 182 ++++++++++++++ .../bin/test/test_contrib_util.py | 152 ++++++++++++ .../SA_mltk_contrib_app/default/algos.conf | 62 +++++ .../SA_mltk_contrib_app/default/app.conf | 18 ++ .../default/data/ui/nav/default.xml | 7 + .../default/data/ui/views/README.md | 1 + .../SA_mltk_contrib_app/metadata/default.meta | 39 +++ .../SA_mltk_contrib_app/splunkbase.manifest | 233 ++++++++++++++++++ 51 files changed, 2380 insertions(+) create mode 100644 deployment-apps/SA_mltk_contrib_app/LICENSE create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/README.md create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/AgglomerativeClustering.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/CollaborativeFilter.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/CorrelationMatrix.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/CustomDecisionTreeClassifier.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/ExampleAlgo.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/ExtraTreesClassifier.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/IsolationForest.py create mode 100644 
deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/LatentDirichletAllocation.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/LinearSVC.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/MDS.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/MinMaxScaler.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/NMF.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/OrthogonalMatchingPursuit.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/SVR.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/SavgolFilter.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/TFBinary.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/TSNE.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/TruncatedSVD.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/__init__.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/__init__.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_CustomDecisionTreeClassifier.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_IsolationForest.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_agglomerative_clustering.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_collaborativefilter.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_correlation_matrix.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_example_algo.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_extra_trees_classifier.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_latent_dirichlet_allocation.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_linear_svc.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_mds.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_min_max_scaler.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_nmf.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_orthogonal_matching_pursuit.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_savgol_filter.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_svr.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_tf_binary.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_truncated_svd.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_tsne.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/link_mltk.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/test.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/test/__init__.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/test/contrib_util.py create mode 100644 deployment-apps/SA_mltk_contrib_app/bin/test/test_contrib_util.py create mode 100644 deployment-apps/SA_mltk_contrib_app/default/algos.conf create mode 100644 deployment-apps/SA_mltk_contrib_app/default/app.conf create mode 100644 
deployment-apps/SA_mltk_contrib_app/default/data/ui/nav/default.xml create mode 100644 deployment-apps/SA_mltk_contrib_app/default/data/ui/views/README.md create mode 100644 deployment-apps/SA_mltk_contrib_app/metadata/default.meta create mode 100644 deployment-apps/SA_mltk_contrib_app/splunkbase.manifest diff --git a/deployment-apps/SA_mltk_contrib_app/LICENSE b/deployment-apps/SA_mltk_contrib_app/LICENSE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/deployment-apps/SA_mltk_contrib_app/bin/README.md b/deployment-apps/SA_mltk_contrib_app/bin/README.md new file mode 100644 index 00000000..9a70db09 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/README.md @@ -0,0 +1 @@ +This is where you put any scripts you want to add to this app. diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/AgglomerativeClustering.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/AgglomerativeClustering.py new file mode 100644 index 00000000..72bd7365 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/AgglomerativeClustering.py @@ -0,0 +1,92 @@ +import numpy as np +from sklearn.cluster import AgglomerativeClustering as AgClustering +from sklearn.metrics import silhouette_samples + +from base import BaseAlgo +from util.param_util import convert_params +from util import df_util + + +class AgglomerativeClustering(BaseAlgo): + """Use scikit-learn's AgglomerativeClustering algorithm to cluster data.""" + + def __init__(self, options): + + feature_variables = options.get('feature_variables', {}) + target_variable = options.get('target_variable', {}) + + # Ensure fields are present + if len(feature_variables) == 0: + raise RuntimeError('You must supply one or more fields') + + # No from clause allowed + if len(target_variable) > 0: + raise RuntimeError('AgglomerativeClustering does not support the from clause') + + # Convert params & alias k to n_clusters + params = options.get('params', {}) + out_params = convert_params( + params, + ints=['k'], + strs=['linkage', 'affinity'], + aliases={'k': 'n_clusters'} + ) + + # Check for valid linkage + if 'linkage' in out_params: + valid_linkage = ['ward', 'complete', 'average'] + if out_params['linkage'] not in valid_linkage: + raise RuntimeError('linkage must be one of: {}'.format(', '.join(valid_linkage))) + + # Check for valid affinity + if 'affinity' in out_params: + valid_affinity = ['l1', 'l2', 'cosine', 'manhattan', + 'precomputed', 'euclidean'] + + if out_params['affinity'] not in valid_affinity: + raise RuntimeError('affinity must be one of: {}'.format(', '.join(valid_affinity))) + + # Check for invalid affinity & linkage combination + if 'linkage' in out_params and 'affinity' in out_params: + if out_params['linkage'] == 'ward': + if out_params['affinity'] != 'euclidean': + raise RuntimeError('ward 
linkage (default) must use euclidean affinity (default)') + + # Initialize the estimator + self.estimator = AgClustering(**out_params) + + def fit(self, df, options): + """Do the clustering & merge labels with original data.""" + # Make a copy of the input data + X = df.copy() + + # Use the df_util prepare_features method to + # - drop null columns & rows + # - convert categorical columns into dummy indicator columns + # X is our cleaned data, nans is a mask of the null value locations + X, nans, columns = df_util.prepare_features(X, self.feature_variables) + + # Do the actual clustering + y_hat = self.estimator.fit_predict(X.values) + + # attach silhouette coefficient score for each row + silhouettes = silhouette_samples(X, y_hat) + + # Combine the two arrays, and transpose them. + y_hat = np.vstack([y_hat, silhouettes]).T + + # Assign default output names + default_name = 'cluster' + + # Get the value from the as-clause if present + output_name = options.get('output_name', default_name) + + # There are two columns - one for the labels, for the silhouette scores + output_names = [output_name, 'silhouette_score'] + + # Use the predictions & nans-mask to create a new dataframe + output_df = df_util.create_output_dataframe(y_hat, nans, output_names) + + # Merge the dataframe with the original input data + df = df_util.merge_predictions(df, output_df) + return df diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/CollaborativeFilter.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/CollaborativeFilter.py new file mode 100644 index 00000000..e6474416 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/CollaborativeFilter.py @@ -0,0 +1,109 @@ + +from base import BaseAlgo +import pandas as pd +import numpy as np + +from sklearn.metrics.pairwise import pairwise_distances +from cexc import get_logger +from util import df_util +from util.param_util import convert_params + +# Everyone's favorite in memory collaborative filter, not a scaleable solution for millions of users and millions of items +# https://en.wikipedia.org/wiki/Collaborative_filtering +# please check out more scaleable solutions in KNN or "Recommender Systems: The Textbook" +# TODO add coldstart solution for nulls +# TODO currently we assume a |fillnull value=0 is run in splunk prior to calling the algorithm + +# We ASSUME rows are users, columns are items. +# TODO I seem to cause splunk memory issues with wide tables, so I should consider doing an XYSERIES like reshape +# TODO and consider taking in a table of USERID, ITEM , RATING from splunk. Yucky. + +# TODO There are many many many other distance metrics that could be a good fit. + + +class CollaborativeFilter(BaseAlgo): + def __init__(self, options): + + + # set parameters + params = options.get('params', {}) + out_params = convert_params( + params, + strs=['user_field','rating_type','coldstart_field'] + ) + + # set defaults for parameters + if 'user_field' in out_params: + self.user_field = out_params['user_field'] + else: + self.user_field = "SME" + + self.rating_type="item" + if 'rating_type' in out_params: + if out_params['rating_type'] == "item": + self.rating_type="item" + elif out_params['rating_type'] == "user": + self.rating_type="user" + + + def fit(self, df, options): + # df contains all the search results, including hidden fields + # but the requested requested are saved as self.feature_variables + logger = get_logger('MyCustomLogging') + + X=df.copy() + + # it is always best practice to prepare your data. 
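+        # df_util.prepare_features (see AgglomerativeClustering above) drops null rows/columns and can
+        # one-hot encode categorical fields; get_dummies is turned off below because the user/item
+        # ratings matrix is expected to be numeric already (a "| fillnull value=0" upstream is assumed,
+        # per the TODO note above).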
+ # splunk has a number of hidden fields that are exposed as part of the search protocole, and we really only + # want the features that are valid field names. + + + #Make sure to turn off get_dummies + X, _, self.columns = df_util.prepare_features( + X=X, + variables=self.feature_variables, + get_dummies=False, + mlspl_limits=options.get('mlspl_limits'), + ) + + # test if user field is in the list + logger.debug("The user field is %s",self.user_field ) + try: + my_list_index=(X[self.user_field].values) + except: + raise RuntimeError('You must specify user field that exists. You sent %s',self.user_field) + + X=X.drop([self.user_field],axis=1) + my_list_header=(X.columns.values) + + #ratings as a matrix , clean that data up! + X=X.replace([np.inf, -np.inf], "nan").replace("nan","0") + matrix=X.values + # force type for Numpy Math + matrix=matrix.astype(np.float64) + + # should consider erroring out when you have super sparse user data + # TODO add other methods via parameter + user_sim = pairwise_distances(matrix, metric='cosine') + item_sim = pairwise_distances(matrix.T, metric='cosine') + + #item prediction + item_sim= matrix.dot(item_sim) / np.array([np.abs(item_sim).sum(axis=1)]) + + #user sim + mean_user_rating = matrix.mean(axis=1) + matrix_diff = (matrix - mean_user_rating[:, np.newaxis]) + user_sim = mean_user_rating[:, np.newaxis] + user_sim.dot(matrix_diff) / np.array([np.abs(user_sim).sum(axis=1)]).T + + # add back into the matrix the header row + if self.rating_type == "item": + output_df=pd.DataFrame(item_sim,columns=my_list_header, index=my_list_index) + if self.rating_type == "user": + output_df=pd.DataFrame(user_sim,columns=my_list_header, index=my_list_index) + output_df[self.user_field]=pd.Series(my_list_index).values + + return output_df + + + + diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/CorrelationMatrix.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/CorrelationMatrix.py new file mode 100644 index 00000000..1dd5075c --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/CorrelationMatrix.py @@ -0,0 +1,57 @@ +from base import BaseAlgo + + +class CorrelationMatrix(BaseAlgo): + """Compute and return a correlation matrix.""" + + def __init__(self, options): + """Check for valid correlation type, and save it to an attribute on self.""" + + feature_variables = options.get('feature_variables', {}) + target_variable = options.get('target_variable', {}) + + if len(feature_variables) == 0: + raise RuntimeError('You must supply one or more fields') + + if len(target_variable) > 0: + raise RuntimeError('CorrelationMatrix does not support the from clause') + + valid_methods = ['spearman', 'kendall', 'pearson'] + + # Check to see if parameters exist + params = options.get('params', {}) + + # Check if method is in parameters in search + if 'method' in params: + if params['method'] not in valid_methods: + error_msg = 'Invalid value for method: must be one of {}'.format( + ', '.join(valid_methods)) + raise RuntimeError(error_msg) + + # Assign method to self for later usage + self.method = params['method'] + + # Assign default method and ensure no other parameters are present + else: + # Default method for correlation + self.method = 'pearson' + + # Check for bad parameters + if len(params) > 0: + raise RuntimeError('The only valid parameter is method.') + + def fit(self, df, options): + """Compute the correlations and return a DataFrame.""" + + # df contains all the search results, including hidden fields + # but the requested 
requested are saved as self.feature_variables + requested_columns = df[self.feature_variables] + + # Get correlations + correlations = requested_columns.corr(method=self.method) + + # Reset index so that all the data are in columns + # (this is necessary for the corr method) + output_df = correlations.reset_index() + + return output_df diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/CustomDecisionTreeClassifier.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/CustomDecisionTreeClassifier.py new file mode 100644 index 00000000..1a851f8a --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/CustomDecisionTreeClassifier.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python + +from sklearn.tree import DecisionTreeClassifier as _DecisionTreeClassifier +from base import ClassifierMixin, BaseAlgo +from codec import codecs_manager +from util.param_util import convert_params +from util.algo_util import tree_summary + +#This algorithm is an updated version of DecisionTreecClassifier from MLTK and class weight parameter has been added to it + +class CustomDecisionTreeClassifier(ClassifierMixin, BaseAlgo): + def __init__(self, options): + self.handle_options(options) + + out_params = convert_params( + options.get('params', {}), + ints=['random_state', 'max_depth', 'min_samples_split', 'max_leaf_nodes'], + strs=['criterion', 'splitter', 'max_features', 'class_weight'], + ) + + # whitelist valid values for criterion, as error raised by sklearn for invalid values is uninformative + if 'criterion' in out_params: + try: + assert (out_params['criterion'] in ['gini', 'entropy']) + except AssertionError: + raise RuntimeError('Invalid value for option criterion: "%s"' % out_params['criterion']) + + # whitelist valid values for splitter, as error raised by sklearn for invalid values is uninformative + if 'splitter' in out_params: + try: + assert (out_params['splitter'] in ['best', 'random']) + except AssertionError: + raise RuntimeError('Invalid value for option splitter: "%s"' % out_params['splitter']) + + if 'max_depth' not in out_params: + out_params.setdefault('max_leaf_nodes', 2000) + + # EAFP... convert max_features to int or float if it is a number. 
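+        # scikit-learn accepts max_features as an int (an absolute feature count), a float (a fraction
+        # of the features), or a string such as 'sqrt' or 'log2'; the try/except below converts numeric
+        # strings and leaves the named options untouched.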
+ try: + out_params['max_features'] = float(out_params['max_features']) + max_features_int = int(out_params['max_features']) + if out_params['max_features'] == max_features_int: + out_params['max_features'] = max_features_int + except: + pass + + if 'class_weight' in out_params: + try: + from ast import literal_eval + out_params['class_weight'] = literal_eval(out_params['class_weight']) + except Exception: + raise RuntimeError('Invalid value for option class_weight: "%s"' % out_params['class_weight']) + + self.estimator = _DecisionTreeClassifier(**out_params) + + def summary(self, options): + if 'args' in options: + raise RuntimeError('Summarization does not take values other than parameters') + return tree_summary(self, options) + + @staticmethod + def register_codecs(): + from codec.codecs import SimpleObjectCodec, TreeCodec + codecs_manager.add_codec('algos_contrib.CustomDecisionTreeClassifier', 'CustomDecisionTreeClassifier', SimpleObjectCodec) + codecs_manager.add_codec('sklearn.tree.tree', 'DecisionTreeClassifier', SimpleObjectCodec) + codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec) diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/ExampleAlgo.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/ExampleAlgo.py new file mode 100644 index 00000000..4c199e06 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/ExampleAlgo.py @@ -0,0 +1,9 @@ +from base import BaseAlgo + + +class ExampleAlgo(BaseAlgo): + def __init__(self, options): + pass + + def fit(self, df, options): + return df diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/ExtraTreesClassifier.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/ExtraTreesClassifier.py new file mode 100644 index 00000000..f3459677 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/ExtraTreesClassifier.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python + +from pandas import DataFrame +from sklearn.ensemble import ExtraTreesClassifier as _ExtraTreesClassifier + +from base import ClassifierMixin, BaseAlgo +from codec import codecs_manager +from util.param_util import convert_params +from util.algo_util import handle_max_features + + +class ExtraTreesClassifier(ClassifierMixin, BaseAlgo): + + def __init__(self, options): + self.handle_options(options) + + out_params = convert_params( + options.get('params', {}), + ints=['random_state', 'n_estimators', 'max_depth', + 'min_samples_split', 'max_leaf_nodes'], + strs=['max_features', 'criterion'], + ) + + if 'max_depth' not in out_params: + out_params.setdefault('max_leaf_nodes', 2000) + + if 'max_features' in out_params: + out_params['max_features'] = handle_max_features(out_params['max_features']) + + self.estimator = _ExtraTreesClassifier(class_weight='balanced', + **out_params) + + def summary(self, options): + if len(options) != 2: # only model name and mlspl_limits + raise RuntimeError('"%s" models do not take options for summarization' % self.__class__.__name__) + df = DataFrame({ + 'feature': self.columns, + 'importance': self.estimator.feature_importances_.ravel() + }) + return df + + @staticmethod + def register_codecs(): + from codec.codecs import SimpleObjectCodec, TreeCodec + codecs_manager.add_codec('algos_contrib.ExtraTreesClassifier', + 'ExtraTreesClassifier', SimpleObjectCodec) + codecs_manager.add_codec('sklearn.ensemble.forest', + 'ExtraTreesClassifier', SimpleObjectCodec) + codecs_manager.add_codec('sklearn.tree.tree', 'ExtraTreeClassifier', + SimpleObjectCodec) + 
codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec) diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/IsolationForest.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/IsolationForest.py new file mode 100644 index 00000000..227ead01 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/IsolationForest.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python + +from sklearn.ensemble import IsolationForest as _IsolationForest +import numpy as np +import pandas as pd + +from base import ClustererMixin, BaseAlgo +from codec import codecs_manager +from codec.codecs import BaseCodec +from codec.flatten import flatten, expand +from util import df_util +from util.param_util import convert_params +from cexc import get_messages_logger,get_logger + +class IsolationForest(ClustererMixin, BaseAlgo): + """ + This is the implementation wrapper around Isolation Forest from scikit-learn. It inherits methods from ClustererMixin and BaseAlgo. + """ + def __init__(self,options): + self.handle_options(options) + out_params = convert_params( + options.get('params',{}), + ints = ['n_estimators','n_jobs','random_state','verbose'], + floats = ['max_samples','contamination','max_features'], + bools = ['bootstrap'] + ) + self.return_scores = out_params.pop('anomaly_score', True) + + # whitelist n_estimators > 0 + if 'n_estimators' in out_params and out_params['n_estimators']<=0: + msg = 'Invalid value error: n_estimators must be greater than 0 and an integer, but found n_estimators="{}".' + raise RuntimeError(msg.format(out_params['n_estimators'])) + + # whitelist max_samples > 0 and < 1 + if 'max_samples' in out_params and out_params['max_samples']<0 and out_params['max_samples']>1: + msg = 'Invalid value error: max_samples must be greater than 0 and a float, but found max_samples="{}".' + raise RuntimeError(msg.format(out_params['max_samples'])) + + # whitelist contamination should be in (0.0, 0.5] as error raised by sklearn for values out of range + if 'contamination' in out_params and not (0.0 < out_params['contamination'] <= 0.5): + msg = ( + 'Invalid value error: Valid values for contamination are in (0.0, 0.5], ' + 'but found contamination="{}".' + ) + raise RuntimeError(msg.format(out_params['contamination'])) + + # whitelist max_features > 0 and < 1 + if 'max_features' in out_params and out_params['max_features']<0 and out_params['max_features']>1: + msg = 'Invalid value error: max_features must be greater than 0, but found max_features="{}".' + raise RuntimeError(msg.format(out_params['max_features'])) + + + self.estimator = _IsolationForest(**out_params) + + + def apply(self, df, options): + # Make a copy of data, to not alter original dataframe + logger = get_logger('IsolationForest Logger') + X = df.copy() + + X, nans, _ = df_util.prepare_features( + X=X, + variables=self.feature_variables, + final_columns=self.columns, + mlspl_limits=options.get('mlspl_limits'), + ) + + # Multiplying the result by -1 to represent Outliers with 1 and Inliers/Normal points with 1. 
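+        # (scikit-learn's IsolationForest.predict returns -1 for outliers and +1 for inliers, so after
+        # the multiplication outliers are labelled 1 and inliers -1.)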
+ y_hat = self.estimator.predict(X.values)*-1 + # Printing the accuracy for prediction of outliers + accuracy = "Accuracy: {}".format(str(round((list(y_hat).count(-1)*100)/y_hat.shape[0], 2))) + logger.debug(accuracy) + + y_hat = y_hat.astype('str') + + #Assign output_name + default_name = 'isOutlier' + new_name = options.get('output_name', None) + output_name = self.rename_output(default_names=default_name, new_names=new_name) + + # Create output dataframe + output = df_util.create_output_dataframe( + y_hat=y_hat, nans=nans, output_names=output_name + ) + # Merge with original dataframe + output = df_util.merge_predictions(df, output) + return output + + def rename_output(self, default_names, new_names=None): + """Utility hook to rename output. + + The default behavior is to take the default_names passed in and simply + return them. If however a particular algo needs to rename the columns of + the output, this method can be overridden. + """ + return new_names if new_names is not None else default_names + + + @staticmethod + def register_codecs(): + from codec.codecs import SimpleObjectCodec, TreeCodec + codecs_manager.add_codec('algos.IsolationForest', 'IsolationForest', SimpleObjectCodec) + codecs_manager.add_codec('sklearn.ensemble.iforest', 'IsolationForest', SimpleObjectCodec) + codecs_manager.add_codec('sklearn.tree.tree','ExtraTreeRegressor', ExtraTreeRegressorCodec) + codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec) + + +class ExtraTreeRegressorCodec(BaseCodec): + """ + This is an ExtraTreeRegressor Codec for saving the Isolation Forest base estimator to memory/file. + """ + @classmethod + def encode(cls, obj): + import sklearn.tree + assert type(obj) == sklearn.tree.tree.ExtraTreeRegressor + state = obj.__getstate__() + return { + '__mlspl_type': [type(obj).__module__, type(obj).__name__], + 'state': state + } + + @classmethod + def decode(cls,obj): + from sklearn.tree.tree import ExtraTreeRegressor + state = obj['state'] + t = ExtraTreeRegressor.__new__(ExtraTreeRegressor) + t.__setstate__(state) + return t \ No newline at end of file diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/LatentDirichletAllocation.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/LatentDirichletAllocation.py new file mode 100644 index 00000000..57a15c73 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/LatentDirichletAllocation.py @@ -0,0 +1,35 @@ +''' +Once newer version of sklearn is used will need to change k alias from n_topics to n_components +https://stackoverflow.com/a/48121678 +''' + +from sklearn.decomposition import LatentDirichletAllocation as _LatentDirichletAllocation +from base import BaseAlgo, TransformerMixin +from codec import codecs_manager +from util.param_util import convert_params + +class LatentDirichletAllocation(TransformerMixin, BaseAlgo): + + def __init__(self, options): + self.handle_options(options) + out_params = convert_params( + options.get('params', {}), + floats=['doc_topic_prior','learning_decay','learning_offset','perp_tol','mean_change_tol'], + strs=['learning_method'], + ints=['k','max_iter','batch_size','evaluate_every','total_samples','max_doc_update_iter','n_jobs','verbose','random_state'], + aliases={'k': 'n_topics'} + ) + + self.estimator = _LatentDirichletAllocation(**out_params) + + def rename_output(self, default_names, new_names): + if new_names is None: + new_names = 'LDA' + output_names = ['{}_{}'.format(new_names, i+1) for i in xrange(len(default_names))] + return output_names + + 
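+    # With the default prefix the k topic columns are returned as LDA_1 ... LDA_k; an as-clause on the
+    # fit/apply command replaces the prefix. Note that xrange is the Python 2 built-in; under Python 3
+    # this would need to be range.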
@staticmethod + def register_codecs(): + from codec.codecs import SimpleObjectCodec + codecs_manager.add_codec('algos_contrib.LatentDirichletAllocation', 'LatentDirichletAllocation', SimpleObjectCodec) + codecs_manager.add_codec('sklearn.decomposition.online_lda', 'LatentDirichletAllocation', SimpleObjectCodec) diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/LinearSVC.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/LinearSVC.py new file mode 100644 index 00000000..4718d24a --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/LinearSVC.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python + +from sklearn.svm import LinearSVC as _LinearSVC + +from codec import codecs_manager +from base import BaseAlgo, ClassifierMixin +from util.param_util import convert_params + + +class LinearSVC(ClassifierMixin, BaseAlgo): + + def __init__(self, options): + self.handle_options(options) + + out_params = convert_params( + options.get('params', {}), + floats=['gamma', 'C', 'tol', 'intercept_scaling'], + ints=['random_state','max_iter'], + strs=['penalty', 'loss', 'multi_class'], + bools=['dual', 'fit_intercept'], + ) + + self.estimator = _LinearSVC(**out_params) + + @staticmethod + def register_codecs(): + from codec.codecs import SimpleObjectCodec + codecs_manager.add_codec('algos_contrib.LinearSVC', 'LinearSVC', SimpleObjectCodec) + codecs_manager.add_codec('sklearn.svm.classes', 'LinearSVC', SimpleObjectCodec) diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/MDS.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/MDS.py new file mode 100644 index 00000000..43ea86d1 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/MDS.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python + +from sklearn.manifold import MDS as _MDS + +from base import BaseAlgo, TransformerMixin +from codec import codecs_manager +from util.param_util import convert_params + +from util import df_util + +class MDS(TransformerMixin, BaseAlgo): + + def __init__(self, options): + self.handle_options(options) + out_params = convert_params( + options.get('params', {}), + ints=['k', 'max_iter', 'n_init', 'n_jobs'], + floats=['eps'], + bools=['metric'], + aliases={'k': 'n_components'} + ) + + if 'max_iter' not in out_params: + out_params.setdefault('max_iter', 300) + + if 'n_init' not in out_params: + out_params.setdefault('n_init', 4) + + if 'n_jobs' not in out_params: + out_params.setdefault('n_jobs', 1) + + if 'eps' not in out_params: + out_params.setdefault('eps', 0.001) + + if 'metric' not in out_params: + out_params.setdefault('metric', True) + + self.estimator = _MDS(**out_params) + + def rename_output(self, default_names, new_names): + if new_names is None: + new_names = 'MDS' + output_names = ['{}_{}'.format(new_names, i+1) for i in xrange(len(default_names))] + return output_names + + def apply(self, df, options): + # Make a copy of data, to not alter original dataframe + X = df.copy() + + # Prepare the features + X, nans, _ = df_util.prepare_features( + X=X, + variables=self.feature_variables, + final_columns=self.columns, + ) + + # Call the transform method + y_hat = self.estimator.fit_transform(X.values) + + # Assign output_name + output_name = options.get('output_name', None) + default_names = self.make_output_names( + output_name=output_name, + n_names=y_hat.shape[1], + ) + output_names = self.rename_output(default_names, output_name) + + # Create output dataframe + output = df_util.create_output_dataframe( + y_hat=y_hat, + nans=nans, + 
output_names=output_names, + ) + + # Merge with original dataframe + output = df_util.merge_predictions(df, output) + return output + + @staticmethod + def register_codecs(): + from codec.codecs import SimpleObjectCodec + codecs_manager.add_codec('algos_contrib.MDS', 'MDS', SimpleObjectCodec) + codecs_manager.add_codec('sklearn.manifold.MDS', 'MDS', SimpleObjectCodec) diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/MinMaxScaler.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/MinMaxScaler.py new file mode 100644 index 00000000..86436af2 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/MinMaxScaler.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python + +import pandas as pd +from sklearn.preprocessing import MinMaxScaler as _MinMaxScaler + +from base import BaseAlgo, TransformerMixin +from codec import codecs_manager +from util.param_util import convert_params +from util import df_util + + +class MinMaxScaler(TransformerMixin, BaseAlgo): + + def __init__(self, options): + self.handle_options(options) + + out_params = convert_params( + options.get('params', {}), + bools=['copy'], + strs=['feature_range'] + ) + self.estimator = _MinMaxScaler(**out_params) + self.columns = None + + def rename_output(self, default_names, new_names=None): + if new_names is None: + new_names = 'MMS' + output_names = [new_names + '_' + feature for feature in self.columns] + return output_names + + def partial_fit(self, df, options): + # Make a copy of data, to not alter original dataframe + X = df.copy() + + X, _, columns = df_util.prepare_features( + X=X, + variables=self.feature_variables, + mlspl_limits=options.get('mlspl_limits'), + ) + if self.columns is not None: + df_util.handle_new_categorical_values(X, None, options, self.columns) + if X.empty: + return + else: + self.columns = columns + self.estimator.partial_fit(X) + + def summary(self, options): + if len(options) != 2: # only model name and mlspl_limits + raise RuntimeError('"%s" models do not take options for summarization' % self.__class__.__name__) + return pd.DataFrame({'fields': self.columns, + 'mean': self.estimator.mean_, + 'var': self.estimator.var_, + 'scale': self.estimator.scale_}) + + @staticmethod + def register_codecs(): + from codec.codecs import SimpleObjectCodec + codecs_manager.add_codec('algos_contrib.MinMaxScaler', 'MinMaxScaler', SimpleObjectCodec) + codecs_manager.add_codec('sklearn.preprocessing.data', 'MinMaxScaler', SimpleObjectCodec) diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/NMF.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/NMF.py new file mode 100644 index 00000000..dddeb924 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/NMF.py @@ -0,0 +1,31 @@ +from sklearn.decomposition import NMF as _NMF +from base import BaseAlgo, TransformerMixin +from codec import codecs_manager +from util.param_util import convert_params + +class NMF(TransformerMixin, BaseAlgo): + + def __init__(self, options): + self.handle_options(options) + out_params = convert_params( + options.get('params', {}), + floats=['beta_loss','tol','alpha','l1_ratio'], + strs=['init','solver'], + ints=['k','max_iter','random_state'], + bools=['versbose','shuffle'], + aliases={'k': 'n_components'} + ) + + self.estimator = _NMF(**out_params) + + def rename_output(self, default_names, new_names): + if new_names is None: + new_names = 'NMF' + output_names = ['{}_{}'.format(new_names, i+1) for i in xrange(len(default_names))] + return output_names + + @staticmethod + def 
register_codecs(): + from codec.codecs import SimpleObjectCodec + codecs_manager.add_codec('algos_contrib.NMF', 'NMF', SimpleObjectCodec) + codecs_manager.add_codec('sklearn.decomposition.nmf', 'NMF', SimpleObjectCodec) diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/OrthogonalMatchingPursuit.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/OrthogonalMatchingPursuit.py new file mode 100644 index 00000000..9dd378e9 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/OrthogonalMatchingPursuit.py @@ -0,0 +1,38 @@ +import pandas as pd +from sklearn.linear_model import OrthogonalMatchingPursuit as _OrthogonalMatchingPursuit +from base import RegressorMixin, BaseAlgo +from util.param_util import convert_params +from util import df_util + + +class OrthogonalMatchingPursuit(RegressorMixin, BaseAlgo): + def __init__(self, options): + self.handle_options(options) + + params = options.get('params', {}) + out_params = convert_params( + params, + floats=['tol'], + strs=['kernel'], + ints=['n_nonzero_coefs'], + bools=['fit_intercept', 'normalize'], + ) + + self.estimator = _OrthogonalMatchingPursuit(**out_params) + + def summary(self, options): + if len(options) != 2: # only model name and mlspl_limits + raise RuntimeError('"%s" models do not take options for summarization' % self.__class__.__name__) + df = pd.DataFrame({'feature': self.columns, + 'coefficient': self.estimator.coef_.ravel()}) + idf = pd.DataFrame({'feature': ['_intercept'], + 'coefficient': [self.estimator.intercept_]}) + return pd.concat([df, idf]) + + @staticmethod + def register_codecs(): + from codec.codecs import SimpleObjectCodec + from codec import codecs_manager + codecs_manager.add_codec('algos_contrib.OrthogonalMatchingPursuit', 'OrthogonalMatchingPursuit', SimpleObjectCodec) + codecs_manager.add_codec('sklearn.linear_model.omp', 'OrthogonalMatchingPursuit', SimpleObjectCodec) + diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/SVR.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/SVR.py new file mode 100644 index 00000000..63d1f241 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/SVR.py @@ -0,0 +1,27 @@ +from sklearn.svm import SVR as _SVR + +from base import BaseAlgo, RegressorMixin +from util.param_util import convert_params + + +class SVR(RegressorMixin, BaseAlgo): + + def __init__(self, options): + self.handle_options(options) + + params = options.get('params', {}) + out_params = convert_params( + params, + floats=['C', 'gamma'], + strs=['kernel'], + ints=['degree'], + ) + + self.estimator = _SVR(**out_params) + + @staticmethod + def register_codecs(): + from codec.codecs import SimpleObjectCodec + from codec import codecs_manager + codecs_manager.add_codec('algos_contrib.SVR', 'SVR', SimpleObjectCodec) + codecs_manager.add_codec('sklearn.svm.classes', 'SVR', SimpleObjectCodec) diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/SavgolFilter.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/SavgolFilter.py new file mode 100644 index 00000000..80b28e92 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/SavgolFilter.py @@ -0,0 +1,48 @@ +import numpy as np +from scipy.signal import savgol_filter + +from base import BaseAlgo +from util.param_util import convert_params +from util import df_util + + +class SavgolFilter(BaseAlgo): + + def __init__(self, options): + # set parameters + params = options.get('params', {}) + out_params = convert_params( + params, + ints=['window_length', 
'polyorder', 'deriv'] + ) + + # set defaults for parameters + if 'window_length' in out_params: + self.window_length = out_params['window_length'] + else: + self.window_length = 5 + + if 'polyorder' in out_params: + self.polyorder = out_params['polyorder'] + else: + self.polyorder = 2 + + if 'deriv' in out_params: + self.deriv = out_params['deriv'] + else: + self.deriv = 0 + + def fit(self, df, options): + X = df.copy() + X, nans, columns = df_util.prepare_features(X, self.feature_variables) + + def f(x): + return savgol_filter(x, self.window_length, self.polyorder, self.deriv) + + y_hat = np.apply_along_axis(f, 0, X) + + names = ['SG_%s' % col for col in columns] + output_df = df_util.create_output_dataframe(y_hat, nans, names) + df = df_util.merge_predictions(df, output_df) + + return df \ No newline at end of file diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/TFBinary.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/TFBinary.py new file mode 100644 index 00000000..08317d35 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/TFBinary.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python +''' +Copy of existing TFIDF algo but with 2 boolean options added and 3 options set +so that binary output is achieved. +''' + +from sklearn.feature_extraction.text import TfidfVectorizer as _TfidfVectorizer + +from base import BaseAlgo +from codec import codecs_manager +from util import df_util +from util.param_util import convert_params + + +class TFBinary(BaseAlgo): + + def handle_options(self, options): + if len(options.get('feature_variables', [])) != 1 or len(options.get('target_variable', [])) > 0: + raise RuntimeError('Syntax error: You must specify exactly one field') + + def __init__(self, options): + self.handle_options(options) + + out_params = convert_params( + options.get('params', {}), + ints=['max_features'], + bools=['use_idf','binary'], + strs=['max_df', 'min_df', + 'ngram_range', 'stop_words', + 'analyzer', 'norm', 'token_pattern'], + ) + + for doc_freq, default_val in [('max_df', 1.0), ('min_df', 1)]: + if doc_freq in out_params: + # EAFP... convert max_df/min_df to float/int if it is a number. + try: + float_val = float(out_params[doc_freq]) + int_val = int(float_val) + except ValueError: + raise RuntimeError('Syntax Error: {doc_freq} requires a numeric value, e.g. {doc_freq}=1.0'.format(doc_freq=doc_freq)) + if float_val == 1.0: + out_params[doc_freq] = default_val + elif float_val == int_val: + out_params[doc_freq] = int_val + else: + out_params[doc_freq] = float_val + + if 'ngram_range' in out_params.keys(): + try: + out_params['ngram_range'] = tuple(int(i) for i in out_params['ngram_range'].split('-')) + assert len(out_params['ngram_range']) == 2 + except: + raise RuntimeError('Syntax Error: ngram_range requires a range, e.g. ngram_range=1-5') + + # TODO: Maybe let the user know that we make this change. 
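+        # max_features defaults to 100 to cap the vocabulary (and therefore the number of output
+        # columns); with binary=True, use_idf=False and norm=None the TfidfVectorizer reduces to a
+        # plain 0/1 term-presence encoding rather than a weighted TF-IDF score.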
+ out_params.setdefault('max_features', 100) + + # Binary defaults + out_params.setdefault('use_idf', False) + out_params.setdefault('norm', None) + out_params.setdefault('binary', True) + + self.estimator = _TfidfVectorizer(**out_params) + + def fit(self, df, options): + # Make a copy of data, to not alter original dataframe + X = df.copy() + + # Make sure to turn off get_dummies + X, _, self.columns = df_util.prepare_features( + X=X, + variables=self.feature_variables, + get_dummies=False, + mlspl_limits=options.get('mlspl_limits'), + ) + + X = X.values.ravel().astype('str') + self.estimator.fit(X) + + def make_output_names(self, options): + default_name = self.feature_variables[0] + '_tfbin' + output_name = options.get('output_name', default_name) + feature_names = self.estimator.get_feature_names() + output_names = [output_name + '_' + str(index) + '_' + word + for (index, word) in enumerate(feature_names)] + return output_names + + def apply(self, df, options): + # Make a copy of data, to not alter original dataframe + X = df.copy() + + # Make sure to turn off get_dummies + X, nans, _ = df_util.prepare_features( + X=X, + variables=self.feature_variables, + final_columns=self.columns, + get_dummies=False, + mlspl_limits=options.get('mlspl_limits'), + ) + + X = X.values.ravel().astype('str') + y_hat = self.estimator.transform(X) + + # Convert the returned sparse matrix into array + y_hat = y_hat.toarray() + + output_names = self.make_output_names(options) + + output = df_util.create_output_dataframe( + y_hat=y_hat, + output_names=output_names, + nans=nans, + ) + + df = df_util.merge_predictions(df, output) + return df + + @staticmethod + def register_codecs(): + from codec.codecs import SimpleObjectCodec + codecs_manager.add_codec('algos_contrib.TFBinary', 'TFBinary', SimpleObjectCodec) + codecs_manager.add_codec('sklearn.feature_extraction.text', 'TfidfVectorizer', SimpleObjectCodec) + codecs_manager.add_codec('sklearn.feature_extraction.text', 'TfidfTransformer', SimpleObjectCodec) + codecs_manager.add_codec('scipy.sparse.dia', 'dia_matrix', SimpleObjectCodec) diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/TSNE.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/TSNE.py new file mode 100644 index 00000000..ef32ebd2 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/TSNE.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python + +from sklearn.manifold import TSNE as _TSNE + +from base import BaseAlgo, TransformerMixin +from codec import codecs_manager +from util.param_util import convert_params + +from util import df_util + +class TSNE(TransformerMixin, BaseAlgo): + + def __init__(self, options): + self.handle_options(options) + out_params = convert_params( + options.get('params', {}), + ints=['k', 'n_iter'], + floats=['perplexity', 'early_exaggeration', 'learning_rate'], + aliases={'k': 'n_components'} + ) + + if out_params['n_components'] < 1: + msg = 'Invalid value for k: k must be greater than or equal to 1, but found k="{}".' 
+ raise RuntimeError(msg.format(out_params['n_components'])) + + if 'n_iter' not in out_params: + out_params.setdefault('n_iter', 200) + + if 'perplexity' not in out_params: + out_params.setdefault('perplexity', 30.0) + + if 'early_exaggeration' not in out_params: + out_params.setdefault('early_exaggeration', 4.0) + + if 'learning_rate' not in out_params: + out_params.setdefault('learning_rate', 100) + + self.estimator = _TSNE(**out_params) + + def rename_output(self, default_names, new_names): + if new_names is None: + new_names = 'TSNE' + output_names = ['{}_{}'.format(new_names, i+1) for i in xrange(len(default_names))] + return output_names + + def apply(self, df, options): + # Make a copy of data, to not alter original dataframe + X = df.copy() + + # Prepare the features + X, nans, _ = df_util.prepare_features( + X=X, + variables=self.feature_variables, + final_columns=self.columns, + ) + + # Call the transform method + y_hat = self.estimator.fit_transform(X.values) + + # Assign output_name + output_name = options.get('output_name', None) + default_names = self.make_output_names( + output_name=output_name, + n_names=y_hat.shape[1], + ) + output_names = self.rename_output(default_names, output_name) + + # Create output dataframe + output = df_util.create_output_dataframe( + y_hat=y_hat, + nans=nans, + output_names=output_names, + ) + + # Merge with original dataframe + output = df_util.merge_predictions(df, output) + return output + + @staticmethod + def register_codecs(): + from codec.codecs import SimpleObjectCodec + codecs_manager.add_codec('algos_contrib.TSNE', 'TSNE', SimpleObjectCodec) + codecs_manager.add_codec('sklearn.manifold.t_sne', 'TSNE', SimpleObjectCodec) diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/TruncatedSVD.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/TruncatedSVD.py new file mode 100644 index 00000000..6f592146 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/TruncatedSVD.py @@ -0,0 +1,30 @@ +from sklearn.decomposition import TruncatedSVD as _TruncatedSVD +from base import BaseAlgo, TransformerMixin +from codec import codecs_manager +from util.param_util import convert_params + +class TruncatedSVD(TransformerMixin, BaseAlgo): + + def __init__(self, options): + self.handle_options(options) + out_params = convert_params( + options.get('params', {}), + floats=['tol'], + strs=['algorithm'], + ints=['k','n_iter','random_state'], + aliases={'k': 'n_components'} + ) + + self.estimator = _TruncatedSVD(**out_params) + + def rename_output(self, default_names, new_names): + if new_names is None: + new_names = 'SVD' + output_names = ['{}_{}'.format(new_names, i+1) for i in xrange(len(default_names))] + return output_names + + @staticmethod + def register_codecs(): + from codec.codecs import SimpleObjectCodec + codecs_manager.add_codec('algos_contrib.TruncatedSVD', 'TruncatedSVD', SimpleObjectCodec) + codecs_manager.add_codec('sklearn.decomposition.truncated_svd', 'TruncatedSVD', SimpleObjectCodec) diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/__init__.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/__init__.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_CustomDecisionTreeClassifier.py 
b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_CustomDecisionTreeClassifier.py new file mode 100644 index 00000000..cb92af53 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_CustomDecisionTreeClassifier.py @@ -0,0 +1,23 @@ +import pandas as pd +from algos_contrib.CustomDecisionTreeClassifier import CustomDecisionTreeClassifier +from test.contrib_util import AlgoTestUtils + +def test_algo(): + input_df = pd.DataFrame({ + 'a': [1, 2, 3], + 'b': [4, 5, 6], + 'c': ['a', 'b', 'c'], + }) + options = { + 'target_variable': ['a'], + 'feature_variables': ['b', 'c'], + } + required_methods = ( + '__init__', + 'fit', + 'apply', + 'summary', + 'register_codecs', + ) + AlgoTestUtils.assert_algo_basic(CustomDecisionTreeClassifier, required_methods , input_df, options) + \ No newline at end of file diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_IsolationForest.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_IsolationForest.py new file mode 100644 index 00000000..3e42e2da --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_IsolationForest.py @@ -0,0 +1,26 @@ +from algos_contrib.IsolationForest import IsolationForest +from test.contrib_util import AlgoTestUtils +import pandas as pd + +def test_algo(): + AlgoTestUtils.assert_algo_basic(IsolationForest, serializable=False) + +def test_algo_options(): + input_df = pd.DataFrame({ + 'a': [5.1, 4.9, 4.7, 4.6], + 'b': [3.5, 3.0, 3.1, 3.2], + 'c': [1.4, 1.4, 1.5, 1.6], + 'd': [0.2, 0.2, 0.2, 0.4], + 'e': ['Iris Setosa','Iris Setosa','Iris Versicolor','Iris Virginica'] + }) + options = { + 'target_variables' : [], + 'feature_variables': ['a','b','c','d'], + } + required_methods = ( + '__init__', + 'fit', + 'apply', + 'register_codecs', + ) + AlgoTestUtils.assert_algo_basic(IsolationForest, required_methods=required_methods, input_df=input_df, options=options, serializable=False) \ No newline at end of file diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_agglomerative_clustering.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_agglomerative_clustering.py new file mode 100644 index 00000000..b136a3aa --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_agglomerative_clustering.py @@ -0,0 +1,6 @@ +from algos_contrib.AgglomerativeClustering import AgglomerativeClustering +from test.contrib_util import AlgoTestUtils + + +def test_algo(): + AlgoTestUtils.assert_algo_basic(AgglomerativeClustering, serializable=False) diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_collaborativefilter.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_collaborativefilter.py new file mode 100644 index 00000000..f5d152cd --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_collaborativefilter.py @@ -0,0 +1,6 @@ +from algos_contrib.CollaborativeFilter import CollaborativeFilter +from test.contrib_util import AlgoTestUtils + + +def test_algo(): + AlgoTestUtils.assert_algo_basic(CollaborativeFilter, serializable=False) diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_correlation_matrix.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_correlation_matrix.py new file mode 100644 index 00000000..45c6d55e --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_correlation_matrix.py @@ -0,0 +1,6 @@ +from 
algos_contrib.CorrelationMatrix import CorrelationMatrix +from test.contrib_util import AlgoTestUtils + + +def test_algo(): + AlgoTestUtils.assert_algo_basic(CorrelationMatrix, serializable=False) diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_example_algo.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_example_algo.py new file mode 100644 index 00000000..fdcda355 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_example_algo.py @@ -0,0 +1,7 @@ +from algos_contrib.ExampleAlgo import ExampleAlgo +from test.contrib_util import AlgoTestUtils + + +def test_algo(): + AlgoTestUtils.assert_algo_basic(ExampleAlgo, serializable=False) + diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_extra_trees_classifier.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_extra_trees_classifier.py new file mode 100644 index 00000000..e959a3ea --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_extra_trees_classifier.py @@ -0,0 +1,26 @@ +import pandas as pd +from algos_contrib.ExtraTreesClassifier import ExtraTreesClassifier +from test.contrib_util import AlgoTestUtils + + + + +def test_algo(): + input_df = pd.DataFrame({ + 'a': [1, 2, 3], + 'b': [4, 5, 6], + 'c': ['a', 'b', 'c'], + }) + options = { + 'target_variable': ['a'], + 'feature_variables': ['b', 'c'], + } + required_methods = ( + '__init__', + 'fit', + 'partial_fit', + 'apply', + 'summary', + 'register_codecs', + ) + AlgoTestUtils.assert_algo_basic(ExtraTreesClassifier, required_methods, input_df, options) \ No newline at end of file diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_latent_dirichlet_allocation.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_latent_dirichlet_allocation.py new file mode 100644 index 00000000..bce1a10f --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_latent_dirichlet_allocation.py @@ -0,0 +1,23 @@ +import pandas as pd +from algos_contrib.LatentDirichletAllocation import LatentDirichletAllocation +from test.contrib_util import AlgoTestUtils + + +def test_algo(): + input_df = pd.DataFrame({ + 'a': [1, 2, 3], + 'b': [4, 5, 6], + 'c': ['a', 'b', 'c'], + }) + options = { + 'feature_variables': ['b', 'c'], + } + required_methods = ( + '__init__', + 'fit', + 'partial_fit', + 'apply', + 'summary', + 'register_codecs', + ) + AlgoTestUtils.assert_algo_basic(LatentDirichletAllocation, required_methods, input_df, options) diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_linear_svc.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_linear_svc.py new file mode 100644 index 00000000..99d09d0b --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_linear_svc.py @@ -0,0 +1,26 @@ +import pandas as pd +from algos_contrib.LinearSVC import LinearSVC +from test.contrib_util import AlgoTestUtils + + + + +def test_algo(): + input_df = pd.DataFrame({ + 'a': [1, 2, 3], + 'b': [4, 5, 6], + 'c': ['a', 'b', 'c'], + }) + options = { + 'target_variable': ['a'], + 'feature_variables': ['b', 'c'], + } + required_methods = ( + '__init__', + 'fit', + 'partial_fit', + 'apply', + 'summary', + 'register_codecs', + ) + AlgoTestUtils.assert_algo_basic(LinearSVC, required_methods, input_df, options) diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_mds.py
b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_mds.py new file mode 100644 index 00000000..16ff5388 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_mds.py @@ -0,0 +1,6 @@ +from algos_contrib.MDS import MDS +from test.contrib_util import AlgoTestUtils + + +def test_algo(): + AlgoTestUtils.assert_algo_basic(MDS, serializable=False) diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_min_max_scaler.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_min_max_scaler.py new file mode 100644 index 00000000..22f119a2 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_min_max_scaler.py @@ -0,0 +1,23 @@ +import pandas as pd +from algos_contrib.MinMaxScaler import MinMaxScaler +from test.contrib_util import AlgoTestUtils + + +def test_algo(): + input_df = pd.DataFrame({ + 'a': [1, 2, 3], + 'b': [4, 5, 6], + 'c': ['a', 'b', 'c'], + }) + options = { + 'feature_variables': ['a', 'b', 'c'], + } + required_methods = ( + '__init__', + 'fit', + 'partial_fit', + 'apply', + 'summary', + 'register_codecs', + ) + AlgoTestUtils.assert_algo_basic(MinMaxScaler, required_methods, input_df, options) diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_nmf.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_nmf.py new file mode 100644 index 00000000..7dd9c088 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_nmf.py @@ -0,0 +1,23 @@ +import pandas as pd +from algos_contrib.NMF import NMF +from test.contrib_util import AlgoTestUtils + + +def test_algo(): + input_df = pd.DataFrame({ + 'a': [1, 2, 3], + 'b': [4, 5, 6], + 'c': ['a', 'b', 'c'], + }) + options = { + 'feature_variables': ['a', 'b', 'c'], + } + required_methods = ( + '__init__', + 'fit', + 'partial_fit', + 'apply', + 'summary', + 'register_codecs', + ) + AlgoTestUtils.assert_algo_basic(NMF, required_methods, input_df, options) diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_orthogonal_matching_pursuit.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_orthogonal_matching_pursuit.py new file mode 100644 index 00000000..e959a3ea --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_orthogonal_matching_pursuit.py @@ -0,0 +1,26 @@ +import pandas as pd +from algos_contrib.OrthogonalMatchingPursuit import OrthogonalMatchingPursuit +from test.contrib_util import AlgoTestUtils + + + + +def test_algo(): + input_df = pd.DataFrame({ + 'a': [1, 2, 3], + 'b': [4, 5, 6], + 'c': ['a', 'b', 'c'], + }) + options = { + 'target_variable': ['a'], + 'feature_variables': ['b', 'c'], + } + required_methods = ( + '__init__', + 'fit', + 'partial_fit', + 'apply', + 'summary', + 'register_codecs', + ) + AlgoTestUtils.assert_algo_basic(OrthogonalMatchingPursuit, required_methods , input_df, options) \ No newline at end of file diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_savgol_filter.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_savgol_filter.py new file mode 100644 index 00000000..131b8095 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_savgol_filter.py @@ -0,0 +1,6 @@ +from algos_contrib.SavgolFilter import SavgolFilter +from test.contrib_util import AlgoTestUtils + + +def test_algo(): + AlgoTestUtils.assert_algo_basic(SavgolFilter, serializable=False) diff --git 
a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_svr.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_svr.py new file mode 100644 index 00000000..9c10cc22 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_svr.py @@ -0,0 +1,50 @@ +from algos_contrib.SVR import SVR +from test.contrib_util import AlgoTestUtils + +import numpy as np +import pandas as pd + + +def test_algo_basic(): + input_df = pd.DataFrame({ + 'a': [1, 2, 3], + 'b': [4, 5, 6], + 'c': ['a', 'b', 'c'], + }) + options = { + 'target_variable': ['a'], + 'feature_variables': ['b', 'c'], + } + required_methods = ( + '__init__', + 'fit', + 'partial_fit', + 'apply', + 'summary', + 'register_codecs', + ) + AlgoTestUtils.assert_algo_basic(SVR, required_methods, input_df, options) + + +def test_prediction(): + training_df = pd.DataFrame({ + 'y': [1, 2, 3], + 'x1': [4, 5, 6], + 'x2': [7, 8, 9], + }) + options = { + 'target_variable': ['y'], + 'feature_variables': ['x1', 'x2'], + } + test_df = pd.DataFrame({ + 'x1': [4], + 'x2': [7], + }) + + svr = SVR(options) + svr.feature_variables = options['feature_variables'] + svr.target_variable = options['target_variable'][0] + svr.fit(training_df, options) + output = svr.apply(test_df, options) + np.testing.assert_approx_equal(output['predicted(y)'].values, np.array([1.1])) + diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_tf_binary.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_tf_binary.py new file mode 100644 index 00000000..d0b86802 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_tf_binary.py @@ -0,0 +1,6 @@ +from algos_contrib.TFBinary import TFBinary +from test.contrib_util import AlgoTestUtils + + +def test_algo(): + AlgoTestUtils.assert_algo_basic(TFBinary, serializable=False) diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_truncated_svd.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_truncated_svd.py new file mode 100644 index 00000000..f8ce05a9 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_truncated_svd.py @@ -0,0 +1,23 @@ +import pandas as pd +from algos_contrib.TruncatedSVD import TruncatedSVD +from test.contrib_util import AlgoTestUtils + + +def test_algo(): + input_df = pd.DataFrame({ + 'a': [1, 2, 3], + 'b': [4, 5, 6], + 'c': ['a', 'b', 'c'], + }) + options = { + 'feature_variables': ['a', 'b', 'c'], + } + required_methods = ( + '__init__', + 'fit', + 'partial_fit', + 'apply', + 'summary', + 'register_codecs', + ) + AlgoTestUtils.assert_algo_basic(TruncatedSVD, required_methods, input_df, options) diff --git a/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_tsne.py b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_tsne.py new file mode 100644 index 00000000..ed8496fe --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/algos_contrib/tests/test_tsne.py @@ -0,0 +1,38 @@ +import pytest +from algos_contrib.TSNE import TSNE +from test.contrib_util import AlgoTestUtils + +algo_options = {'feature_variables': ['Review']} + + +def test_algo(): + AlgoTestUtils.assert_algo_basic(TSNE, serializable=False) + + +def test_valid_params(): + algo_options['params'] = {'k': '1'} + TSNE_algo = TSNE(algo_options) + assert TSNE_algo.estimator.n_components == 1 + + +def test_invalid_params_k_not_int(): + algo_options['params'] = {'k': '0.1'} + with pytest.raises((RuntimeError, ValueError)) as excinfo: + _ = 
TSNE(algo_options) + assert excinfo.match('Invalid value for k: must be an int') + + +def test_invalid_params_k_not_valid(): + algo_options['params'] = {'k': '0'} + with pytest.raises((RuntimeError, ValueError)) as excinfo: + _ = TSNE(algo_options) + assert excinfo.match('Invalid value for k: k must be greater than or equal to 1') + + +def test_default_parameter_values(): + algo_options['params'] = {'k': '1'} + TSNE_algo = TSNE(algo_options) + assert TSNE_algo.estimator.n_iter == 200 + assert TSNE_algo.estimator.perplexity == 30.0 + assert TSNE_algo.estimator.early_exaggeration == 4.0 + assert TSNE_algo.estimator.learning_rate == 100 diff --git a/deployment-apps/SA_mltk_contrib_app/bin/link_mltk.py b/deployment-apps/SA_mltk_contrib_app/bin/link_mltk.py new file mode 100644 index 00000000..9b77b70e --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/link_mltk.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python +""" Small utility to add the MLTK bin path to the system path. +This makes it easy to import algorithms or utilities from the MLTK.""" +import os +import sys + + +def check_splunk_home(splunk_home): + """ Check SPLUNK_HOME and raise if not set.""" + if not splunk_home: + raise RuntimeError('No $SPLUNK_HOME provided. Please set SPLUNK_HOME.') + + +def get_mltk_bin_path(splunk_home): + """ Create the path to the MLTK bin folder.""" + check_splunk_home(splunk_home) + mltk_path = os.path.join(splunk_home, 'etc', 'apps', 'Splunk_ML_Toolkit', 'bin') + + if not os.path.exists(mltk_path): + raise RuntimeError('MLTK bin folder not found at {}: is MLTK installed?'.format(mltk_path)) + + return mltk_path + + +def add_mltk(): + """ Adds MLTK bin path to sys.path """ + splunk_home = os.environ.get('SPLUNK_HOME', None) + mltk_bin_path = get_mltk_bin_path(splunk_home) + sys.path.insert(0, mltk_bin_path) diff --git a/deployment-apps/SA_mltk_contrib_app/bin/test.py b/deployment-apps/SA_mltk_contrib_app/bin/test.py new file mode 100644 index 00000000..45a6f316 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/test.py @@ -0,0 +1,5 @@ +from link_mltk import add_mltk +add_mltk() + +from test.util import check_signatures + diff --git a/deployment-apps/SA_mltk_contrib_app/bin/test/__init__.py b/deployment-apps/SA_mltk_contrib_app/bin/test/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/deployment-apps/SA_mltk_contrib_app/bin/test/contrib_util.py b/deployment-apps/SA_mltk_contrib_app/bin/test/contrib_util.py new file mode 100644 index 00000000..ca978c28 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/test/contrib_util.py @@ -0,0 +1,182 @@ +""" Utility methods for use in testing.""" +import ConfigParser +import json +import os +from inspect import getargspec + +import pandas as pd + +from base import BaseAlgo +from codec import MLSPLDecoder, MLSPLEncoder + + +PACKAGE_NAME='algos_contrib' + + +class AlgoTestUtils(object): + """ + Helper methods for testing algorithm implementations + """ + @staticmethod + def assert_method_signature(algo_cls, method_name, args): + """ + Assert the signature of the specified method + + Args: + algo_cls (class): a custom algorithm class to check + method_name (str): the name of the method + args (list): expected arguments to the named method + + Returns: + (bool): True if the method is callable and has the specified arguments, False otherwise. 
+ + Raises: + AssertionError + """ + method = getattr(algo_cls, method_name, None) + assert method, "Method '{}' does not exist".format(method_name) + assert callable(method), "Method '{}' is not callable".format(method_name) + found_args = getargspec(method).args + msg = 'Method {} has signature: {} - but should have {}'.format(method, args, found_args) + assert found_args == args, msg + + @classmethod + def assert_registered(cls, algo_cls): + """ + Assert that the algorithm is registered in the algos.conf configuration file. + + Args: + algo_cls (class): a custom algorithm class to check + + Returns: + (bool): True if the method is registered in algos.conf file. + + Raises: + AssertionError + """ + config = ConfigParser.RawConfigParser() + with cls.get_algos_conf_fp() as f: + config.readfp(f) + algo_name = algo_cls.__name__ + try: + package_name = config.get(algo_name, 'package') + except ConfigParser.NoSectionError: + assert False, "'{}' not registered in algos.conf".format(algo_name) + except ConfigParser.NoOptionError: + assert False, "'{}' must override 'package' option in algos.conf".format(algo_name) + + assert package_name == PACKAGE_NAME, "The package name must be '{}'".format(PACKAGE_NAME) + + @staticmethod + def assert_serializable(algo_cls, input_df, options): + """ + Assert that the model created by the algorithm is serializable. + + Args: + algo_cls (class): a custom algorithm class to check + input_df (pandas Dataframe): input dataframe for the algorithm being tested + options (dict): options for the fit() (and apply(), if applicable) methods of the algorithm + + Returns: + (bool): True if the the model is serializable, False otherwise. + + Raises: + AssertionError + """ + assert hasattr(algo_cls, 'register_codecs') + algo_cls.register_codecs() + + algo_inst = algo_cls(options) + algo_inst.feature_variables = ['b', 'c'] + algo_inst.target_variable = 'a' + algo_inst.fit(input_df.copy(), options) + + encoded = json.dumps(algo_inst, cls=MLSPLEncoder) + decoded = json.loads(encoded, cls=MLSPLDecoder) + + orig_y = algo_inst.apply(input_df.copy(), options) + decoded_y = decoded.apply(input_df.copy(), options) + pd.util.testing.assert_frame_equal(orig_y, decoded_y) + + @classmethod + def assert_base_algo_method_signatures(cls, algo_cls, required_methods=None): + """ + Assert that the signatures of algorithm's methods adhere to the API. + + Args: + algo_cls (class): a custom algorithm class to check. + required_methods (list): list of required method names. + '__init__' and 'fit' are always required, so + they do not need to be included. + + + Returns: + (bool): True if the methods adhere to the API, False otherwise. + + Raises: + AssertionError + """ + method_args_map = { + '__init__': ['self', 'options'], + 'fit': ['self', 'df', 'options'], + 'partial_fit': ['self', 'df', 'options'], + 'apply': ['self', 'df', 'options'], + 'summary': ['self', 'options'], + 'register_codecs': [], + } + + if required_methods is None: + required_methods = [] + + assert issubclass(algo_cls, BaseAlgo), 'Algorithms must inherit from BaseAlgo.' + + required_method_set = set(required_methods) + extra_methods = required_method_set - method_args_map.viewkeys() + assert extra_methods == set(), "'{}' not in BaseAlgo".format(", ".join(extra_methods)) + + # __init__ and fit are always required. 
+ required_method_set.add('__init__') + required_method_set.add('fit') + + for required_method in required_method_set: + cls.assert_method_signature(algo_cls, required_method, method_args_map[required_method]) + + @classmethod + def assert_algo_basic(cls, algo_cls, required_methods=None, input_df=None, options=None, serializable=True): + """ + Assert signatures of methods, registration, and serialization + + Args: + algo_cls (class): a custom algorithm class to check. + input_df (pandas Dataframe): input dataframe for the algorithm being tested + options (dict): options for the fit() (and apply(), if applicable) methods of the algorithm + serializable (bool): whether to check serializability or not. + + Returns: + (bool): True if the methods adhere to the API, False otherwise. + + Raises: + AssertionError + """ + cls.assert_base_algo_method_signatures(algo_cls, required_methods) + cls.assert_registered(algo_cls) + if serializable: + # The input and options are required for serializability test. + assert input_df is not None + assert options is not None + cls.assert_serializable(algo_cls, input_df, options) + + @staticmethod + def get_algos_conf_fp(): + """ + Get a reference (pointer) to algos.conf file open for read + + This method mainly exists to aid testing. + + Returns: + (File): algos.conf file pointer + """ + algos_file_path = os.path.join(os.path.dirname(__file__), '..', '..', 'default', 'algos.conf') + return open(algos_file_path) + + diff --git a/deployment-apps/SA_mltk_contrib_app/bin/test/test_contrib_util.py b/deployment-apps/SA_mltk_contrib_app/bin/test/test_contrib_util.py new file mode 100644 index 00000000..38936aba --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/bin/test/test_contrib_util.py @@ -0,0 +1,152 @@ +import mock +import io +import pandas as pd +import pytest +import sys + +from base import BaseAlgo +from util.base_util import MLSPLNotImplementedError + +from contrib_util import AlgoTestUtils + + +@pytest.fixture +def min_algo_cls(): + class MinimalAlgo(BaseAlgo): + pass + return MinimalAlgo + + +@pytest.fixture +def serializable_algo_cls(): + class SerializableAlgo(BaseAlgo): + def __init__(self, options): + pass + + def fit(self, df, options): + pass + + def apply(self, df, options): + return df + + @classmethod + def register_codecs(cls): + from codec.codecs import SimpleObjectCodec + from codec import codecs_manager + codecs_manager.add_codec('test.test_contrib_util', 'SerializableAlgo', SimpleObjectCodec) + + # Add the class to this module so that encoder and decoder can access it. + # This is only necessary for a fixture function. Normally, these classes will be defined within a module. 
+ setattr(sys.modules[__name__], 'SerializableAlgo', SerializableAlgo) + return SerializableAlgo + + +mock_algo_conf = """ +[MinimalAlgo] +package=algos_contrib +""" + + +mock_algo_conf_no_package = """ +[MinimalAlgo] +""" + + +def test_method_signature(min_algo_cls): + AlgoTestUtils.assert_method_signature(min_algo_cls, 'fit', ['self', 'df', 'options']) + + +@mock.patch.object(AlgoTestUtils, 'get_algos_conf_fp', return_value=io.BytesIO(mock_algo_conf)) +def test_registered(mock_get_algos_conf_fp, min_algo_cls): + AlgoTestUtils.assert_registered(min_algo_cls) + + +def test_serializable(serializable_algo_cls): + AlgoTestUtils.assert_serializable(serializable_algo_cls, input_df=pd.DataFrame({}), options={}) + + +def test_base_algo_method_signatures_default_methods(min_algo_cls): + AlgoTestUtils.assert_base_algo_method_signatures(min_algo_cls) + + +def test_base_algo_method_signatures_all_methods(min_algo_cls): + AlgoTestUtils.assert_base_algo_method_signatures(min_algo_cls, required_methods=[ + '__init__', + 'fit', + 'partial_fit', + 'apply', + 'register_codecs', + ]) + + +def test_base_algo_method_signatures_extra_methods(min_algo_cls): + with pytest.raises(AssertionError) as e: + extra_args = [ + 'extra1', + 'extra2', + ] + AlgoTestUtils.assert_base_algo_method_signatures(min_algo_cls, required_methods=[ + '__init__', + 'fit', + 'partial_fit', + 'apply', + 'register_codecs', + ] + extra_args) + assert e.match('{}.*not in BaseAlgo'.format(extra_args)) + + +@mock.patch.object(AlgoTestUtils, 'get_algos_conf_fp', return_value=io.BytesIO(mock_algo_conf)) +def test_algo_basic(mock_get_algos_conf_fp, min_algo_cls): + AlgoTestUtils.assert_algo_basic(min_algo_cls, serializable=False) + + +def test_no_base_algo(): + class NoBaseAlgo(object): + pass + + with pytest.raises(AssertionError) as e: + AlgoTestUtils.assert_base_algo_method_signatures(NoBaseAlgo) + assert e.match('must inherit from BaseAlgo') + + +def test_method_signature_non_existent(min_algo_cls): + bad_method = 'foot' + with pytest.raises(AssertionError) as e: + AlgoTestUtils.assert_method_signature(min_algo_cls, bad_method, ['self', 'df', 'options']) + e.match("{}.*does not exist".format(bad_method)) + + +def test_method_signature_not_callable(min_algo_cls): + bad_method = 'fit' + + # Make fit a property. 
+ min_algo_cls.fit = 'fit' + + with pytest.raises(AssertionError) as e: + AlgoTestUtils.assert_method_signature(min_algo_cls, bad_method, ['self', 'df', 'options']) + e.match("{}.*not callable".format(bad_method)) + + +@mock.patch.object(AlgoTestUtils, 'get_algos_conf_fp', return_value=io.BytesIO(mock_algo_conf)) +def test_unregistered(mock_get_algos_conf_fp): + class UnregisteredAlgo(BaseAlgo): + pass + + with pytest.raises(AssertionError) as e: + AlgoTestUtils.assert_registered(UnregisteredAlgo) + assert e.match('{}.*not registered'.format(UnregisteredAlgo.__name__)) + + +@mock.patch.object(AlgoTestUtils, 'get_algos_conf_fp', return_value=io.BytesIO(mock_algo_conf_no_package)) +def test_registered_with_missing_package_option(mock_get_algos_conf_fp, min_algo_cls): + with pytest.raises(AssertionError) as e: + AlgoTestUtils.assert_registered(min_algo_cls) + assert e.match('{}.*must override.*package'.format(min_algo_cls.__name__)) + + +def test_not_serializable(min_algo_cls): + with pytest.raises(MLSPLNotImplementedError) as e: + AlgoTestUtils.assert_serializable(min_algo_cls, input_df=pd.DataFrame({}), options={}) + assert e.match('does not support saving') + + diff --git a/deployment-apps/SA_mltk_contrib_app/default/algos.conf b/deployment-apps/SA_mltk_contrib_app/default/algos.conf new file mode 100644 index 00000000..a52d0c51 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/default/algos.conf @@ -0,0 +1,62 @@ +# Here is where algorithms are registered. +[default] + +######################################################################## +# Due to the layering of configuration files in Splunk, we have to +# override the package name in every section. +######################################################################## + + +[AgglomerativeClustering] +package=algos_contrib + +[CorrelationMatrix] +package=algos_contrib + +[ExampleAlgo] +package=algos_contrib + +[SVR] +package=algos_contrib + +[SavgolFilter] +package=algos_contrib + +[TSNE] +package=algos_contrib + +[MDS] +package=algos_contrib + +[OrthogonalMatchingPursuit] +package=algos_contrib + +[TruncatedSVD] +package=algos_contrib + +[LatentDirichletAllocation] +package=algos_contrib + +[NMF] +package=algos_contrib + +[CollaborativeFilter] +package=algos_contrib + +[CustomDecisionTreeClassifier] +package=algos_contrib + +[TFBinary] +package = algos_contrib + +[MinMaxScaler] +package = algos_contrib + +[LinearSVC] +package = algos_contrib + +[ExtraTreesClassifier] +package = algos_contrib + +[IsolationForest] +package = algos_contrib \ No newline at end of file diff --git a/deployment-apps/SA_mltk_contrib_app/default/app.conf b/deployment-apps/SA_mltk_contrib_app/default/app.conf new file mode 100644 index 00000000..b26b2a9b --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/default/app.conf @@ -0,0 +1,18 @@ +# +# Splunk app configuration file +# + +[install] +is_configured = 0 + +[package] +id = SA_mltk_contrib_app + +[ui] +is_visible = false +label = Splunk MLTK Algorithms on GitHub + +[launcher] +author = Gyanendra Rana +description = An app based on Open Source GitHub repo for Splunk Machine Learning Toolkit Algorithms +version = 1.0 diff --git a/deployment-apps/SA_mltk_contrib_app/default/data/ui/nav/default.xml b/deployment-apps/SA_mltk_contrib_app/default/data/ui/nav/default.xml new file mode 100644 index 00000000..1ab35f75 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/default/data/ui/nav/default.xml @@ -0,0 +1,7 @@ + diff --git a/deployment-apps/SA_mltk_contrib_app/default/data/ui/views/README.md 
b/deployment-apps/SA_mltk_contrib_app/default/data/ui/views/README.md new file mode 100644 index 00000000..6cf74f0b --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/default/data/ui/views/README.md @@ -0,0 +1 @@ +Add all the views that your app needs in this directory diff --git a/deployment-apps/SA_mltk_contrib_app/metadata/default.meta b/deployment-apps/SA_mltk_contrib_app/metadata/default.meta new file mode 100644 index 00000000..b2f4f507 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/metadata/default.meta @@ -0,0 +1,39 @@ + +# Application-level permissions + +[] +access = read : [ * ], write : [ admin, power ] + +### EVENT TYPES + +[eventtypes] +export = system + + +### PROPS + +[props] +export = system + + +### TRANSFORMS + +[transforms] +export = system + + +### LOOKUPS + +[lookups] +export = system + + +### VIEWSTATES: even normal users should be able to create shared viewstates + +[viewstates] +access = read : [ * ], write : [ * ] +export = system + + +[algos] +export = system diff --git a/deployment-apps/SA_mltk_contrib_app/splunkbase.manifest b/deployment-apps/SA_mltk_contrib_app/splunkbase.manifest new file mode 100644 index 00000000..9973d786 --- /dev/null +++ b/deployment-apps/SA_mltk_contrib_app/splunkbase.manifest @@ -0,0 +1,233 @@ +{ + "version": "1.0", + "date": "2023-04-26T14:53:07.461206713Z", + "hashAlgorithm": "SHA-256", + "app": { + "id": 4403, + "version": "1.0", + "files": [ + { + "path": "LICENSE", + "hash": "c71d239df91726fc519c6eb72d318ec65820627232b2f796219e87dcf35d0ab4" + }, + { + "path": "bin/README.md", + "hash": "597cdad620bec4e52e0e8adc3cad99de9b3ce45da0dd18e4159e1009c976e957" + }, + { + "path": "bin/test/test_contrib_util.py", + "hash": "f521bae6ecd4bf13d969fc9ba8fd8b1948fbd59e9dadaf9a1355f5a549cdbe32" + }, + { + "path": "bin/test/__init__.py", + "hash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" + }, + { + "path": "bin/test/contrib_util.py", + "hash": "7c24c1ced03aacce92c29095be20d494cba5c0ed1fc7fa167c713c061c151ffd" + }, + { + "path": "bin/algos_contrib/MinMaxScaler.py", + "hash": "59752f166603a4c85c057acd12bb155be5e98e8c89d4c2944fda247daf860802" + }, + { + "path": "bin/algos_contrib/ExampleAlgo.py", + "hash": "1f67435c2ad60172129e45d6d4d7f8d0ce6c47df1603771166e35a3ee96fc5b4" + }, + { + "path": "bin/algos_contrib/CustomDecisionTreeClassifier.py", + "hash": "8ce1dc0b4dcf774ca1d6e327b737d695d8e091bda2664a756f31da6921e52abd" + }, + { + "path": "bin/algos_contrib/CorrelationMatrix.py", + "hash": "9ab4f8070c695a744c3a1219b27bf265c7044756a9cfdcb507e40a0a7861213a" + }, + { + "path": "bin/algos_contrib/TSNE.py", + "hash": "7c0f772ca89df4480939285ee33857d5eadcad8bb73d80ae834835dad98439b7" + }, + { + "path": "bin/algos_contrib/ExtraTreesClassifier.py", + "hash": "c9c085a39267c0fd8e32ac3830501fc095b431ded57ba7ddc79d05c6576f756c" + }, + { + "path": "bin/algos_contrib/TruncatedSVD.py", + "hash": "d16e383f59b216aae3fd6725632a431c4c54c1e0b2a4f4a6ac203f518d6bca39" + }, + { + "path": "bin/algos_contrib/AgglomerativeClustering.py", + "hash": "d121cb6ff52f06975777c2f155a9f6d3e58c6aa3c1156bdca3ed65d7e33f22ef" + }, + { + "path": "bin/algos_contrib/SVR.py", + "hash": "385c74c4cefdbbb972de4fb9eff78cde879133596285653dad8f3eb00f3840e2" + }, + { + "path": "bin/algos_contrib/SavgolFilter.py", + "hash": "9b174720370b5425f4fec2364e02e9dd7e1e76bdeedc55a7087b232069d58649" + }, + { + "path": "bin/algos_contrib/CollaborativeFilter.py", + "hash": "94384d011c281796c8091ed12f812b4781cfff7f633fc47e6873489a445685b1" + }, + { + "path": 
"bin/algos_contrib/TFBinary.py", + "hash": "399b00a32d2a445fd2dee2cc42a0bde976a1d7640417e5ef4ee64ab60b7e917f" + }, + { + "path": "bin/algos_contrib/LatentDirichletAllocation.py", + "hash": "2c3eec09909771fe0868e4e666b536ba3be76034011bdcc2c7b830f9b65d716e" + }, + { + "path": "bin/algos_contrib/__init__.py", + "hash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" + }, + { + "path": "bin/algos_contrib/LinearSVC.py", + "hash": "bf55507202eb0743d0001ca3ae861cc7bd14a9eaffc2424e983039f6e724c736" + }, + { + "path": "bin/algos_contrib/OrthogonalMatchingPursuit.py", + "hash": "7916076ba3f1bca181d1ac3efbc21e57583376612cf34dd70378c4f0002c91a6" + }, + { + "path": "bin/algos_contrib/MDS.py", + "hash": "c43efcffee0dd3422967e64ccbd8dd5bb7c34da6f2e8a3b3605221251b065b12" + }, + { + "path": "bin/algos_contrib/IsolationForest.py", + "hash": "72fc6e8342f9eaf130570cc2e41a903624226d847ae4259e2cd4983d930c8298" + }, + { + "path": "bin/algos_contrib/NMF.py", + "hash": "62054702577674e0637c94c24026b541b2fa1741ae9dc2a740ab29779550eef2" + }, + { + "path": "bin/algos_contrib/tests/test_linear_svc.py", + "hash": "6a4cabc9a6617f9ee4e9502b71c7532bb7b3adf09ed9164ca63f7f5d98ef76e5" + }, + { + "path": "bin/algos_contrib/tests/test_tsne.py", + "hash": "0f7517b0dbe6f0c373223605fdd7937eedd8d77024dda48cd2af86631cd1c6f8" + }, + { + "path": "bin/algos_contrib/tests/test_svr.py", + "hash": "c59ed4d4217a71408d69003f77feb7ca3160369a4e58570fc34c12269ac934eb" + }, + { + "path": "bin/algos_contrib/tests/test_truncated_svd.py", + "hash": "0f306f7b2395c94c378571ff7040e4907c66f718520c5b59b28a0437fa74a97f" + }, + { + "path": "bin/algos_contrib/tests/test_IsolationForest.py", + "hash": "eff84b1f9ef5802a20e8e928c7c60ebfd6ecc8f4e71cc40aaea9ae540ecb64ad" + }, + { + "path": "bin/algos_contrib/tests/test_min_max_scaler.py", + "hash": "577fdac81ed8e78c2c0b4142d477373f1552ba9c48e04cb3683e4badb6b16e39" + }, + { + "path": "bin/algos_contrib/tests/test_mds.py", + "hash": "8090f4fb85746f3ed42049eb967048fb389bc06d8cc9f08432a948980eceff78" + }, + { + "path": "bin/algos_contrib/tests/test_latent_dirichlet_allocation.py", + "hash": "dfaa22fd3ec482b67bd04b9780eab6f103de4b2255f4ebd272fd42a28ee3468d" + }, + { + "path": "bin/algos_contrib/tests/test_collaborativefilter.py", + "hash": "9e4b5115862ec45f3c6fcf77db75c8603fd7920d990b512d17b3f9b013df1668" + }, + { + "path": "bin/algos_contrib/tests/test_CustomDecisionTreeClassifier.py", + "hash": "96e67dea269fcaf38005a34de4baee6205007d905d4bdfe5b2750f3da79e44c9" + }, + { + "path": "bin/algos_contrib/tests/test_correlation_matrix.py", + "hash": "a47a1ca416ed9553aaa81d749198c2d578c190704168b0c2fc8cd7c81c196119" + }, + { + "path": "bin/algos_contrib/tests/__init__.py", + "hash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" + }, + { + "path": "bin/algos_contrib/tests/test_example_algo.py", + "hash": "e4f9509b4a9c9d30cfc8ab5faf5424de04b13713d5be1d1c4d82aa988426ca59" + }, + { + "path": "bin/algos_contrib/tests/test_extra_trees_classifier.py", + "hash": "7aa79278f98010183abb1be6c937c456bc611f9233aa9199be0030533e8f9ee9" + }, + { + "path": "bin/algos_contrib/tests/test_savgol_filter.py", + "hash": "586e613e41b819b67e239ed316aed90e6a7b9602e3f31d8e41da9b819846013d" + }, + { + "path": "bin/algos_contrib/tests/test_agglomerative_clustering.py", + "hash": "5afa871420153c1f0e82cde120d675b89a7048b03a9b6cf17de20316d3f3dc3d" + }, + { + "path": "bin/algos_contrib/tests/test_orthogonal_matching_pursuit.py", + "hash": "7aa79278f98010183abb1be6c937c456bc611f9233aa9199be0030533e8f9ee9" + }, + { + 
"path": "bin/algos_contrib/tests/test_tf_binary.py", + "hash": "48761919a278357b5ef9a41a0fe79d80a139a9bf6531e27c69dbf01368cadc2d" + }, + { + "path": "bin/algos_contrib/tests/test_nmf.py", + "hash": "246c86e04b3875907c4f70669efd05df3eebad8016b0dd24019c74e12f10ed08" + }, + { + "path": "bin/test.py", + "hash": "cacc66edf525a77a5cb451770360701533ef454391a96be6044179f7df1ca9d1" + }, + { + "path": "bin/link_mltk.py", + "hash": "d94b783b59e249590eacfed4a01edb723f7671c51cef7c6730ba8c41c0e7fa7e" + }, + { + "path": "default/algos.conf", + "hash": "d8a2c63b1406b31f5c98c0bced406895ecd02104455b098fcbf8672c459751f0" + }, + { + "path": "default/data/ui/views/README.md", + "hash": "4ccd9dc2dca5bd634f7c07ad1749e4e63a7969c84e2eff83517256f7c884cd29" + }, + { + "path": "default/data/ui/nav/default.xml", + "hash": "e5e0678bca27efa4ded83f8f83a7f2ef10291a4d66fa43ac4f95ce735fb3e824" + }, + { + "path": "default/app.conf", + "hash": "9ca504d6baa4020f4583d9a950bf259bd706882243483dff2834e787fc376174" + }, + { + "path": "metadata/default.meta", + "hash": "721109ec9f1724ee76ce3d9a4ef68ab1f27dac40ee213f147744e3028b5090ac" + } + ] + }, + "products": [ + { + "platform": "splunk", + "product": "enterprise", + "versions": [ + "7.0", + "7.1", + "7.2" + ], + "architectures": [ + "x86_64" + ], + "operatingSystems": [ + "windows", + "linux", + "macos", + "freebsd", + "solaris", + "aix" + ] + } + ] +} \ No newline at end of file