#!/usr/bin/env python

import pandas as pd
from sklearn.preprocessing import StandardScaler as _StandardScaler

from base import BaseAlgo, TransformerMixin
from codec import codecs_manager
from util.param_util import convert_params
from util import df_util


class StandardScaler(TransformerMixin, BaseAlgo):
    """Algorithm wrapper around scikit-learn's ``StandardScaler``.

    The wrapped estimator is stored in ``self.estimator``; the feature
    column names seen on the first successful ``partial_fit`` call are
    stored in ``self.columns`` (``None`` until then).
    """

    def __init__(self, options):
        """Build the wrapped estimator from the user supplied options.

        Args:
            options: Mapping of algorithm options. The optional ``'params'``
                entry is passed through ``convert_params`` (with
                ``with_mean`` and ``with_std`` declared as booleans) and
                the result is forwarded to the scikit-learn constructor.
        """
        # NOTE(review): handle_options is inherited; it presumably populates
        # self.feature_variables, which partial_fit reads -- confirm in base.
        self.handle_options(options)

        estimator_kwargs = convert_params(
            options.get('params', {}), bools=['with_mean', 'with_std']
        )
        self.estimator = _StandardScaler(**estimator_kwargs)
        self.columns = None

    def rename_output(self, default_names, new_names=None):
        """Return the output field names, one per fitted feature column.

        Args:
            default_names: Unused by this implementation.
            new_names: Prefix for every output name; ``'SS'`` when ``None``.

        Returns:
            list: ``<prefix>_<column>`` for each entry of ``self.columns``.
        """
        prefix = 'SS' if new_names is None else new_names
        return [prefix + '_' + column for column in self.columns]

    def partial_fit(self, df, options):
        """Incrementally fit the wrapped estimator on one batch of data.

        Args:
            df: Input dataframe; it is copied before being prepared.
            options: Mapping of options; ``'mlspl_limits'`` is forwarded to
                ``df_util.prepare_features``.

        Returns:
            None. The estimator (and, on the first call, ``self.columns``)
            is updated in place.
        """
        # Work on a copy so the caller's dataframe is never altered.
        features, _, columns = df_util.prepare_features(
            X=df.copy(),
            variables=self.feature_variables,
            mlspl_limits=options.get('mlspl_limits'),
        )

        if self.columns is None:
            # First batch: remember the column layout for later calls.
            self.columns = columns
        else:
            # Later batches: reconcile against the remembered columns.
            # NOTE(review): the helper's return value is ignored, so it
            # presumably adjusts `features` in place -- confirm in df_util.
            df_util.handle_new_categorical_values(
                features, None, options, self.columns
            )
            if features.empty:
                # Nothing left to learn from in this batch.
                return

        self.estimator.partial_fit(features)

    def summary(self, options):
        """Return the fitted statistics as a dataframe.

        Args:
            options: Mapping that must hold exactly two entries (the model
                name and ``mlspl_limits``).

        Returns:
            pandas.DataFrame: Columns ``fields``, ``mean``, ``var`` and
            ``scale`` taken from ``self.columns`` and the estimator's
            ``mean_``, ``var_`` and ``scale_`` attributes.

        Raises:
            RuntimeError: If ``options`` does not hold exactly two entries.
        """
        # Only the model name and mlspl_limits are expected here.
        if len(options) != 2:
            raise RuntimeError(
                '"%s" models do not take options for summarization'
                % self.__class__.__name__
            )

        fitted_stats = {
            'fields': self.columns,
            'mean': self.estimator.mean_,
            'var': self.estimator.var_,
            'scale': self.estimator.scale_,
        }
        return pd.DataFrame(fitted_stats)

    @staticmethod
    def register_codecs():
        """Register ``SimpleObjectCodec`` for this class and the estimator."""
        from codec.codecs import SimpleObjectCodec

        codec_targets = (
            ('algos.StandardScaler', 'StandardScaler'),
            ('sklearn.preprocessing._data', 'StandardScaler'),
        )
        for module_path, class_name in codec_targets:
            codecs_manager.add_codec(module_path, class_name, SimpleObjectCodec)