You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

55 lines
1.8 KiB

#!/usr/bin/env python
from sklearn.decomposition import KernelPCA as _KPCA
from .PCA import PCA as PCAAlgo
from util.param_util import convert_params
from codec import codecs_manager
from util import df_util
class KernelPCA(PCAAlgo):
    """Kernel PCA algorithm wrapping scikit-learn's ``KernelPCA`` estimator.

    The kernel is always forced to ``'rbf'``. Applying the model is done in
    chunks to bound the memory used by the kernel matrix.
    """

    def __init__(self, options):
        """Process ``options`` and build the underlying estimator.

        Args:
            options: Mapping that may carry a ``'params'`` entry with the
                user-supplied parameters. ``options['feature_variables']``
                is read to pick a default for ``k`` when none is given.

        Raises:
            RuntimeError: If ``k`` is supplied but is not greater than zero.
        """
        self.handle_options(options)

        out_params = convert_params(
            options.get('params', {}),
            ints=['k', 'degree', 'alpha', 'max_iteration'],
            floats=['gamma', 'tolerance'],
            aliases={'k': 'n_components', 'tolerance': 'tol', 'max_iteration': 'max_iter'},
        )
        out_params['kernel'] = 'rbf'

        if 'n_components' not in out_params:
            # Default to two components, or fewer when there are fewer features.
            out_params['n_components'] = min(2, len(options['feature_variables']))
        elif out_params['n_components'] <= 0:
            # Reject negative values too, matching what the message promises.
            raise RuntimeError('k needs to be greater than zero.')

        self.estimator = _KPCA(**out_params)

    # sklearn's KernelPCA.transform tries to form a complete kernel
    # matrix of its input and the original data the model was fit
    # on. Unfortunately, this might consume a colossal amount of
    # memory for large inputs. We chunk the input to cut down on this.
    def apply(self, df, options=None):
        """Apply the parent class's ``apply`` to ``df`` chunk by chunk.

        Args:
            df: Input data handed to ``df_util.apply_in_chunks``.
            options: Options forwarded to the chunked apply. When ``None``,
                nothing is applied.

        Returns:
            Whatever ``df_util.apply_in_chunks`` returns, or ``None`` when
            ``options`` is ``None``.
        """
        # Handle backwards compatibility.
        self.estimator.n_jobs = 1

        if options is None:
            # NOTE(review): with no options nothing is applied and None is
            # returned -- confirm that callers expect this.
            return None

        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        parent_apply = super(KernelPCA, self).apply
        # 1000 is presumably the chunk size in rows -- see apply_in_chunks.
        return df_util.apply_in_chunks(df, parent_apply, 1000, options)

    @staticmethod
    def register_codecs():
        """Register the codecs for this class and the sklearn objects it uses."""
        from codec.codecs import SimpleObjectCodec

        codecs_manager.add_codec('algos.KernelPCA', 'KernelPCA', SimpleObjectCodec)
        codecs_manager.add_codec(
            'sklearn.preprocessing._data', 'KernelCenterer', SimpleObjectCodec
        )
        codecs_manager.add_codec(
            'sklearn.decomposition._kernel_pca', 'KernelPCA', SimpleObjectCodec
        )