#!/usr/bin/env python

import pandas as pd
from sklearn.naive_bayes import GaussianNB as _GaussianNB

import cexc
from base import BaseAlgo, ClassifierMixin
from codec import codecs_manager
from util import df_util


class GaussianNB(ClassifierMixin, BaseAlgo):
    """Algorithm class backed by scikit-learn's Gaussian naive Bayes estimator."""

    def __init__(self, options):
        """Process ``options`` and create an unfitted scikit-learn estimator.

        Args:
            options: Passed unchanged to ``handle_options`` (not defined in
                this class; presumably inherited from a base class).
        """
        self.handle_options(options)
        self.estimator = _GaussianNB()

    def summary(self, options):
        """Build a per-class summary table for the fitted estimator.

        Args:
            options: Must contain exactly two entries (per the original
                documentation: ``model_name`` and ``mlspl_limits``).

        Returns:
            A ``pandas.DataFrame`` indexed by class label with the columns
            ``class``, ``class_count`` and ``class_prior``.  When the number
            of feature variables equals the number of variance columns of
            the estimator, the result of merging in one ``variance(<feature>)``
            column per feature is returned instead.

        Raises:
            RuntimeError: If ``options`` does not hold exactly two entries.
        """
        if len(options) != 2:
            raise RuntimeError(
                '"%s" models do not take options for summarization' % type(self).__name__
            )

        model = self.estimator
        labels = model.classes_

        # Basic per-class statistics: sample counts and rounded priors.
        summary_df = pd.DataFrame(
            {
                'class': labels,
                'class_count': model.class_count_.astype(int),
                'class_prior': model.class_prior_.round(3),
            },
            index=labels,
        )

        # Display names for the per-feature variance columns.
        variance_columns = list(map('variance({})'.format, self.feature_variables))
        variances = model.var_.round(3)

        # Categorical fields are one-hot-encoded by default, which makes the
        # fitted estimator carry more columns than there are feature
        # variables.  Per-feature variances are not reported in that case;
        # warn and fall back to the basic class statistics only.
        if len(variance_columns) != variances.shape[1]:
            cexc.messages.warn(
                "Unable to display variance information for features when categorical fields are supplied."
            )
            return summary_df

        # One row per class, one variance column per feature.
        variance_df = pd.DataFrame(variances, index=labels, columns=variance_columns)
        return df_util.merge_predictions(summary_df, variance_df)

    @staticmethod
    def register_codecs():
        """Register ``SimpleObjectCodec`` for this class and the wrapped estimator."""
        from codec.codecs import SimpleObjectCodec

        # Same codec for both the wrapper module and the scikit-learn module.
        for module_name in ('algos.GaussianNB', 'sklearn.naive_bayes'):
            codecs_manager.add_codec(module_name, 'GaussianNB', SimpleObjectCodec)