You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
69 lines
2.8 KiB
69 lines
2.8 KiB
#!/usr/bin/env python
|
|
|
|
from sklearn.decomposition import PCA as _PCA
|
|
import pandas as pd
|
|
|
|
from base import BaseAlgo, TransformerMixin
|
|
from codec import codecs_manager
|
|
from util.param_util import convert_params
|
|
|
|
|
|
class PCA(TransformerMixin, BaseAlgo):
    """Principal component analysis wrapper around scikit-learn's ``PCA``.

    The number of components is selected either directly with ``k`` or
    indirectly with ``variance`` (the fraction of variance to explain).
    The two parameters are mutually exclusive.
    """

    def __init__(self, options):
        """Validate the user parameters and build the scikit-learn estimator.

        Args:
            options: Dict-like options object. Its optional ``'params'``
                entry holds the user-supplied ``k`` / ``variance`` values.

        Raises:
            RuntimeError: If both ``k`` and ``variance`` are supplied, or if
                ``variance`` is outside ``0 < variance <= 1``.
        """
        self.handle_options(options)

        # NOTE(review): convert_params is presumably responsible for casting
        # `k` to int / `variance` to float and for renaming `k` to
        # `n_components` (the checks below rely on that key) -- confirm
        # against util.param_util.
        out_params = convert_params(
            options.get('params', {}),
            ints=['k'],
            floats=['variance'],
            aliases={'k': 'n_components'},
        )

        if 'variance' in out_params:
            variance = out_params['variance']

            if 'n_components' in out_params:
                msg = "Only one of k = {} or variance={} should be provided. Both cannot be respected.".format(
                    out_params['n_components'], variance
                )
                raise RuntimeError(msg)

            if variance <= 0 or variance > 1:
                msg = "Valid value for variance is 0 < variance <= 1"
                raise RuntimeError(msg)

            # `variance` is not a scikit-learn keyword, so it must never be
            # forwarded to the estimator -- including the valid
            # `variance == 1` case, where n_components is simply left unset.
            del out_params['variance']

            # If we are doing PCA based on variance_ratio_explained, based on
            # the scikit-learn implementation, we set n_components to that
            # percentage, which will select the number of components such
            # that the amount of variance that needs to be explained is
            # greater than the percentage specified by n_components.
            if variance < 1:
                out_params['n_components'] = variance

        self.estimator = _PCA(**out_params)

    def rename_output(self, default_names, new_names):
        """Build the output field names ``<prefix>_1 .. <prefix>_n``.

        Args:
            default_names: Sized collection of default names; only its
                length is used.
            new_names: Prefix for the generated names, or ``None`` to use
                the default prefix ``'PC'``.

        Returns:
            A list with one ``'<prefix>_<i>'`` string per default name,
            numbered from 1.
        """
        prefix = 'PC' if new_names is None else new_names
        return ['{}_{}'.format(prefix, i) for i in range(1, len(default_names) + 1)]

    def summary(self, options):
        """Summarize the fitted components as a DataFrame.

        Only model_name and mlspl_limits are supported for summary, so
        ``options`` must contain exactly two entries.

        Args:
            options: Sized options object for the summary request.

        Returns:
            A ``pandas.DataFrame`` indexed by component label (``PC_1``,
            ``PC_2``, ...) with the columns ``components``,
            ``explained_variance``, ``explained_variance_ratio`` and
            ``singular_values``; numeric columns are rounded to 4 decimals.

        Raises:
            RuntimeError: If ``options`` does not contain exactly two
                entries.
        """
        if len(options) != 2:
            msg = '"%s" models do not take options for summarization' % self.__class__.__name__
            raise RuntimeError(msg)

        estimator = self.estimator
        n_components = ['PC_{}'.format(i) for i in range(1, estimator.n_components_ + 1)]
        return pd.DataFrame(
            {
                'components': n_components,
                'explained_variance': estimator.explained_variance_.round(4),
                'explained_variance_ratio': estimator.explained_variance_ratio_.round(4),
                'singular_values': estimator.singular_values_.round(4),
            },
            index=n_components,
        )

    @staticmethod
    def register_codecs():
        """Register ``SimpleObjectCodec`` for this class and sklearn's PCA."""
        # Imported here rather than at module level, as in the original.
        from codec.codecs import SimpleObjectCodec

        codecs_manager.add_codec('algos.PCA', 'PCA', SimpleObjectCodec)
        codecs_manager.add_codec('sklearn.decomposition._pca', 'PCA', SimpleObjectCodec)
|