You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
285 lines
9.0 KiB
285 lines
9.0 KiB
#!/usr/bin/env python
|
|
|
|
import json
|
|
|
|
import pandas as pd
|
|
import numpy as np
|
|
|
|
from util.param_util import convert_params, is_truthy
|
|
from util.version_util import check_kfold_cv_available
|
|
import base
|
|
from util import df_util
|
|
|
|
|
|
def tree_summary(algo, options=None):
    """Create summary for tree based models.

    Args:
        algo (object): an algo object wrapping a fitted tree estimator
        options (dict): options; may contain 'params' (with 'limit' and
            'json' entries) and 'mlspl_limits' overrides. May be None.

    Returns:
        (dataframe): dataframe representation of the tree summary

    Raises:
        ValueError: if the resolved depth limit is not greater than 0
    """
    options = options or {}
    out_params = convert_params(options.get('params', {}), ints=['limit'], bools=['json'])
    mlspl_limits = options.get('mlspl_limits', {})

    # Explicit search params win; otherwise fall back to mlspl limits.
    # (Replaces the old `'name' not in locals()` introspection, which also
    # raised NameError when options was falsy.)
    return_json = out_params.get('json')
    if return_json is None:
        return_json = is_truthy(mlspl_limits.get('summary_return_json', 'f'))

    depth_limit = out_params.get('limit')
    if depth_limit is None:
        depth_limit = int(mlspl_limits.get('summary_depth_limit', 5))

    if depth_limit <= 0:
        raise ValueError('Limit = %d. Value for limit should be greater than 0.' % depth_limit)

    root = 0
    depth = 0
    if return_json:
        output_data = [
            json.dumps(tree_summary_dict(algo, depth_limit, root, depth), sort_keys=True)
        ]
    else:
        output_data = tree_summary_str(algo, depth_limit, root, depth)
    return pd.DataFrame({'Decision Tree Summary': output_data})
|
|
|
|
|
|
def tree_summary_str(algo, depth_limit, root, depth):
    """Recursively walk a tree/subtree and render each split as a string.

    Args:
        algo (object): algo object wrapping a fitted tree estimator
        depth_limit (int): remaining depth budget for the summary
        root (int): index of the current tree node
        depth (int): current depth (controls the rendered indentation)

    Returns:
        output (list): one formatted summary line per visited node
    """
    tree = algo.estimator.tree_
    left, right = tree.children_left[root], tree.children_right[root]
    node_count = tree.n_node_samples[root]
    node_impurity = tree.impurity[root]

    # Classifiers report the majority class at the node; regressors report
    # the node's predicted value.
    if isinstance(algo, base.ClassifierMixin):
        class_counts = tree.value[root][0]
        majority_class = algo.estimator.classes_[class_counts.argmax()]
        value_str = "class:%s " % majority_class
    else:
        value_str = "value:%.3f " % tree.value[root][0][0]

    if left > 0 or right > 0:
        feature = algo.columns[tree.feature[root]]
        # Only features listed in algo.feature_variables get a threshold
        # rendered; other (derived) columns show just the feature name.
        if feature in algo.feature_variables:
            split_str = "split:%s<=%.3f" % (feature, tree.threshold[root])
        else:
            split_str = "split:%s" % feature
    else:
        split_str = "split:N/A - Leaf node"

    indent = '----' * depth + ' '
    output = [
        "|--" + indent + "count:%d %s %simpurity:%.3f" % (node_count, split_str, value_str, node_impurity)
    ]

    # Recurse into existing children while there is depth budget left.
    if depth_limit >= 1:
        for child in (left, right):
            if child > 0:
                output.extend(tree_summary_str(algo, depth_limit - 1, child, depth + 1))
    return output
|
|
|
|
|
|
def tree_summary_dict(algo, depth_limit, root, depth):
    """Recursively go down a tree/subtree and render splits as dictionaries.

    Args:
        algo (object): algo object wrapping a fitted tree estimator
        depth_limit (int): depth limit of a tree
        root (int): root of the tree/subtree
        depth (int): depth of the tree/subtree

    Return:
        output (dict): tree splits
    """
    t = algo.estimator.tree_
    features = algo.columns

    left_child = t.children_left[root]
    right_child = t.children_right[root]

    output = {}
    output["count"] = int(t.n_node_samples[root])

    # Classifiers report the majority class; regressors the node value.
    if isinstance(algo, base.ClassifierMixin):
        classes = algo.estimator.classes_
        value = t.value[root][0]
        output["class"] = classes[value.argmax()]
    else:
        output["value"] = round(t.value[root][0][0], 3)

    if left_child > 0 or right_child > 0:
        feature = features[t.feature[root]]
        if feature in algo.feature_variables:
            feature_val = t.threshold[root]
            output["split"] = "%s<=%.3f" % (feature, feature_val)
        else:
            # BUGFIX: value was "split:%s", redundantly repeating the dict
            # key inside its own value (leftover from the string renderer).
            output["split"] = "%s" % feature
    else:
        # BUGFIX: value was "split:N/A - Leaf node" (same redundancy).
        output["split"] = "N/A - Leaf node"

    output["impurity"] = round(t.impurity[root], 3)

    # Recurse into existing children while there is depth budget left.
    if depth_limit >= 1:
        depth += 1
        depth_limit -= 1
        if left_child > 0:
            output["left child"] = tree_summary_dict(algo, depth_limit, left_child, depth)
        if right_child > 0:
            output["right child"] = tree_summary_dict(algo, depth_limit, right_child, depth)
    return output
|
|
|
|
|
|
def assert_estimator_supports_partial_fit(estimator):
    """Assert the estimator has a partial_fit method, otherwise raise error.

    Args:
        estimator (object): a scikit-learn estimator

    Raises:
        RuntimeError: when the estimator has no partial_fit method
    """
    if hasattr(estimator, 'partial_fit'):
        return
    raise RuntimeError(
        'Algorithm {} does not support partial fit'.format(type(estimator).__name__)
    )
|
|
|
|
|
|
def confidence_interval_to_alpha(x):
    """Transform confidence interval to alpha."""
    # Valid confidence intervals are strictly between 0 and 100 percent.
    if 0 < x < 100:
        return 1 - x / 100.0
    raise RuntimeError('conf_interval cannot be less than 0 or more than 100.')
|
|
|
|
|
|
def alpha_to_confidence_interval(x):
    """Transform alpha to confidence interval."""
    percent = (1 - x) * 100
    return int(round(percent))
|
|
|
|
|
|
def handle_max_features(max_features):
    """Deal with the multiple types of max_features and error accordingly

    Args:
        max_features (string): the raw value of the max_features parameter

    Returns:
        max_features: it could be a float, an int, a string, or None
    """
    if max_features.lower() == "none":
        return None

    # EAFP... convert max_features to a number if possible, preferring int
    # over float when the value is integral (e.g. "3" and "3.0" both -> 3).
    try:
        as_float = float(max_features)
    except ValueError:
        # Not numeric (e.g. "sqrt", "log2"): pass the string through unchanged.
        return max_features

    # Previously a bare `except:` (which also swallowed KeyboardInterrupt etc.)
    # guarded this; narrow it to the two exceptions int() can actually raise:
    # ValueError for nan, OverflowError for +/-inf. In both cases keep the float.
    try:
        as_int = int(as_float)
    except (ValueError, OverflowError):
        return as_float

    return as_int if as_float == as_int else as_float
|
|
|
|
|
|
def add_missing_attr(estimator, attr, value, param_key=None):
    """Set attributes on the estimator.

    Between versions of scikit-learn, estimators may be missing certain attributes.
    Sometimes those attributes are simply renamed, other times they do not already
    exist. This method is just a utility to set those on the estimator.

    Args:
        estimator (obj): the estimator object
        attr (str): the attribute to add
        value (str, float, int, or str): the value to set if param_key is not used
        param_key (str): the name of an existing param (from get_params) to use. If
            the value is not set, it will default to the value arg.
    """
    if param_key is None:
        new_value = value
    else:
        # Pull the value from the estimator's params, defaulting to `value`
        # when the param is absent.
        new_value = estimator.get_params().get(param_key, value)

    # Only fill in the attribute when it is actually missing.
    if not hasattr(estimator, attr):
        setattr(estimator, attr, new_value)
|
|
|
|
|
|
def get_kfold_cross_validation(estimator, X, y=None, scoring=None, kfolds=None):
    """Return a dataframe of kfold cross validation scores.

    If the estimator is a classifier and y is either binary or multiclass,
    sklearn's StratifiedKFold is used. In all other cases, normal KFold is used.

    See http://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values
    for additional information on scoring options

    Args:
        estimator (object): sklearn-compatible estimator
        X (dataframe): feature dataframe
        y (pd series): target series
        scoring (list): metric names understood by sklearn's scorers
        kfolds (int): the number of folds in the (Stratified)KFold

    Returns:
        cv_df (dataframe): the cross validation scores

    Raises:
        RuntimeError: if kfold_cv is not supported
    """
    check_kfold_cv_available()
    from sklearn.model_selection import cross_validate

    raw_scores = cross_validate(estimator, X, y, scoring=scoring, cv=kfolds)

    # Keep only the held-out ("test_") scores; drop the training-fold scores.
    per_metric_scores = [raw_scores.get('test_' + metric) for metric in scoring]

    # One row per fold, one column per requested metric.
    cv_df = pd.DataFrame(per_metric_scores).T
    cv_df.columns = scoring
    return cv_df
|
|
|
|
|
|
def _cost_complexity_pruning_path(estimator, df, options):
    """Compute the minimal cost-complexity pruning path for the estimator.

    Prepares features and target from the search options, runs sklearn's
    cost_complexity_pruning_path on a copy of the data, and returns the
    effective alphas along the path, clipped at zero.

    Args:
        estimator (object): sklearn tree-based estimator exposing
            cost_complexity_pruning_path
        df (dataframe): input dataframe; not modified (a copy is taken)
        options (dict): must contain 'target_variable'; may contain
            'feature_variables' and 'mlspl_limits'

    Returns:
        (ndarray): ccp_alphas from the pruning path, clipped to be >= 0
    """
    # Make a copy of data, to not alter original dataframe
    X = df.copy()

    # Prepare the dataset
    target = options.get('target_variable')[0]
    X, y, _ = df_util.prepare_features_and_target(
        X=X,
        variables=options.get('target_variable') + (options.get('feature_variables') or []),
        target=target,
        mlspl_limits=options.get('mlspl_limits', {}),
    )

    # compute pruning path
    alphas = estimator.cost_complexity_pruning_path(X, y)['ccp_alphas']

    # clip to zero to avoid numerical precision issues
    return np.clip(alphas, 0, None)
|