From f71c014a59bc31e094e570361b4adb32c8c98941 Mon Sep 17 00:00:00 2001 From: Sam Sharpe Date: Tue, 12 Oct 2021 15:02:14 -0400 Subject: [PATCH 1/2] replace lambdas for parallel processing --- lime/discretize.py | 122 +++-- lime/lime_base.py | 110 ++--- lime/lime_tabular.py | 433 ++++++++++------- lime/tests/test_lime_tabular.py | 813 ++++++++++++++++---------------- 4 files changed, 800 insertions(+), 678 deletions(-) diff --git a/lime/discretize.py b/lime/discretize.py index df3df0a1..278c9028 100644 --- a/lime/discretize.py +++ b/lime/discretize.py @@ -1,15 +1,17 @@ """ Discretizers classes, to be used in lime_tabular """ +from abc import ABCMeta, abstractmethod +from functools import partial + import numpy as np +import scipy import sklearn import sklearn.tree -import scipy from sklearn.utils import check_random_state -from abc import ABCMeta, abstractmethod -class BaseDiscretizer(): +class BaseDiscretizer: """ Abstract class - Build a class that inherits from this class to implement a custom discretizer. @@ -19,8 +21,15 @@ class BaseDiscretizer(): __metaclass__ = ABCMeta # abstract class - def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None, - data_stats=None): + def __init__( + self, + data, + categorical_features, + feature_names, + labels=None, + random_state=None, + data_stats=None, + ): """Initializer Args: data: numpy 2d array @@ -36,8 +45,7 @@ def __init__(self, data, categorical_features, feature_names, labels=None, rando data_stats: must have 'means', 'stds', 'mins' and 'maxs', use this if you don't want these values to be computed from data """ - self.to_discretize = ([x for x in range(data.shape[1]) - if x not in categorical_features]) + self.to_discretize = [x for x in range(data.shape[1]) if x not in categorical_features] self.data_stats = data_stats self.names = {} self.lambdas = {} @@ -63,13 +71,12 @@ def __init__(self, data, categorical_features, feature_names, labels=None, rando boundaries = np.min(data[:, feature]), np.max(data[:, feature]) name = feature_names[feature] - self.names[feature] = ['%s <= %.2f' % (name, qts[0])] + self.names[feature] = ["%s <= %.2f" % (name, qts[0])] for i in range(n_bins - 1): - self.names[feature].append('%.2f < %s <= %.2f' % - (qts[i], name, qts[i + 1])) - self.names[feature].append('%s > %.2f' % (name, qts[n_bins - 1])) + self.names[feature].append("%.2f < %s <= %.2f" % (qts[i], name, qts[i + 1])) + self.names[feature].append("%s > %.2f" % (name, qts[n_bins - 1])) - self.lambdas[feature] = lambda x, qts=qts: np.searchsorted(qts, x) + self.lambdas[feature] = partial(self.discretizer_fn, qts=qts) discretized = self.lambdas[feature](data[:, feature]) # If data stats are provided no need to compute the below set of details @@ -88,6 +95,10 @@ def __init__(self, data, categorical_features, feature_names, labels=None, rando self.mins[feature] = [boundaries[0]] + qts.tolist() self.maxs[feature] = qts.tolist() + [boundaries[1]] + @staticmethod + def discretizer_fn(x, qts): + return np.searchsorted(qts, x) + @abstractmethod def bins(self, data, labels): """ @@ -109,8 +120,7 @@ def discretize(self, data): if len(data.shape) == 1: ret[feature] = int(self.lambdas[feature](ret[feature])) else: - ret[:, feature] = self.lambdas[feature]( - ret[:, feature]).astype(int) + ret[:, feature] = self.lambdas[feature](ret[:, feature]).astype(int) return ret def get_undiscretize_values(self, feature, values): @@ -121,7 +131,7 @@ def get_undiscretize_values(self, feature, values): stds = 
np.array(self.stds[feature])[values] minz = (mins - means) / stds maxz = (maxs - means) / stds - min_max_unequal = (minz != maxz) + min_max_unequal = minz != maxz ret = minz ret[np.where(min_max_unequal)] = scipy.stats.truncnorm.rvs( @@ -129,7 +139,7 @@ def get_undiscretize_values(self, feature, values): maxz[min_max_unequal], loc=means[min_max_unequal], scale=stds[min_max_unequal], - random_state=self.random_state + random_state=self.random_state, ) return ret @@ -141,9 +151,7 @@ def undiscretize(self, data): feature, ret[feature].astype(int).reshape(-1, 1) ) else: - ret[:, feature] = self.get_undiscretize_values( - feature, ret[:, feature].astype(int) - ) + ret[:, feature] = self.get_undiscretize_values(feature, ret[:, feature].astype(int)) return ret @@ -152,13 +160,25 @@ class StatsDiscretizer(BaseDiscretizer): Class to be used to supply the data stats info when discretize_continuous is true """ - def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None, - data_stats=None): - - BaseDiscretizer.__init__(self, data, categorical_features, - feature_names, labels=labels, - random_state=random_state, - data_stats=data_stats) + def __init__( + self, + data, + categorical_features, + feature_names, + labels=None, + random_state=None, + data_stats=None, + ): + + BaseDiscretizer.__init__( + self, + data, + categorical_features, + feature_names, + labels=labels, + random_state=random_state, + data_stats=data_stats, + ) def bins(self, data, labels): bins_from_stats = self.data_stats.get("bins") @@ -175,9 +195,14 @@ def bins(self, data, labels): class QuartileDiscretizer(BaseDiscretizer): def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None): - BaseDiscretizer.__init__(self, data, categorical_features, - feature_names, labels=labels, - random_state=random_state) + BaseDiscretizer.__init__( + self, + data, + categorical_features, + feature_names, + labels=labels, + random_state=random_state, + ) def bins(self, data, labels): bins = [] @@ -189,35 +214,46 @@ def bins(self, data, labels): class DecileDiscretizer(BaseDiscretizer): def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None): - BaseDiscretizer.__init__(self, data, categorical_features, - feature_names, labels=labels, - random_state=random_state) + BaseDiscretizer.__init__( + self, + data, + categorical_features, + feature_names, + labels=labels, + random_state=random_state, + ) def bins(self, data, labels): bins = [] for feature in self.to_discretize: - qts = np.array(np.percentile(data[:, feature], - [10, 20, 30, 40, 50, 60, 70, 80, 90])) + qts = np.array(np.percentile(data[:, feature], [10, 20, 30, 40, 50, 60, 70, 80, 90])) bins.append(qts) return bins class EntropyDiscretizer(BaseDiscretizer): def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None): - if(labels is None): - raise ValueError('Labels must be not None when using \ - EntropyDiscretizer') - BaseDiscretizer.__init__(self, data, categorical_features, - feature_names, labels=labels, - random_state=random_state) + if labels is None: + raise ValueError( + "Labels must be not None when using \ + EntropyDiscretizer" + ) + BaseDiscretizer.__init__( + self, + data, + categorical_features, + feature_names, + labels=labels, + random_state=random_state, + ) def bins(self, data, labels): bins = [] for feature in self.to_discretize: # Entropy splitting / at most 8 bins so max_depth=3 - dt = 
sklearn.tree.DecisionTreeClassifier(criterion='entropy', - max_depth=3, - random_state=self.random_state) + dt = sklearn.tree.DecisionTreeClassifier( + criterion="entropy", max_depth=3, random_state=self.random_state + ) x = np.reshape(data[:, feature], (-1, 1)) dt.fit(x, labels) qts = dt.tree_.threshold[np.where(dt.tree_.children_left > -1)] diff --git a/lime/lime_base.py b/lime/lime_base.py index d8a09317..fe386aa0 100644 --- a/lime/lime_base.py +++ b/lime/lime_base.py @@ -9,10 +9,8 @@ class LimeBase(object): """Class for learning a locally linear sparse model from perturbed data""" - def __init__(self, - kernel_fn, - verbose=False, - random_state=None): + + def __init__(self, kernel_fn, verbose=False, random_state=None): """Init function Args: @@ -40,10 +38,7 @@ def generate_lars_path(weighted_data, weighted_labels): regularization parameter and coefficients, respectively """ x_vector = weighted_data - alphas, _, coefs = lars_path(x_vector, - weighted_labels, - method='lasso', - verbose=False) + alphas, _, coefs = lars_path(x_vector, weighted_labels, method="lasso", verbose=False) return alphas, coefs def forward_selection(self, data, labels, weights, num_features): @@ -56,11 +51,8 @@ def forward_selection(self, data, labels, weights, num_features): for feature in range(data.shape[1]): if feature in used_features: continue - clf.fit(data[:, used_features + [feature]], labels, - sample_weight=weights) - score = clf.score(data[:, used_features + [feature]], - labels, - sample_weight=weights) + clf.fit(data[:, used_features + [feature]], labels, sample_weight=weights) + score = clf.score(data[:, used_features + [feature]], labels, sample_weight=weights) if score > max_: best = feature max_ = score @@ -70,13 +62,12 @@ def forward_selection(self, data, labels, weights, num_features): def feature_selection(self, data, labels, weights, num_features, method): """Selects features for the model. 
see explain_instance_with_data to understand the parameters.""" - if method == 'none': + if method == "none": return np.array(range(data.shape[1])) - elif method == 'forward_selection': + elif method == "forward_selection": return self.forward_selection(data, labels, weights, num_features) - elif method == 'highest_weights': - clf = Ridge(alpha=0.01, fit_intercept=True, - random_state=self.random_state) + elif method == "highest_weights": + clf = Ridge(alpha=0.01, fit_intercept=True, random_state=self.random_state) clf.fit(data, labels, sample_weight=weights) coef = clf.coef_ @@ -102,7 +93,7 @@ def feature_selection(self, data, labels, weights, num_features, method): if pad_counter >= num_to_pad: break else: - nnz_indexes = argsort_data[sdata - num_features:sdata][::-1] + nnz_indexes = argsort_data[sdata - num_features : sdata][::-1] indices = weighted_data.indices[nnz_indexes] return indices else: @@ -110,38 +101,39 @@ def feature_selection(self, data, labels, weights, num_features, method): feature_weights = sorted( zip(range(data.shape[1]), weighted_data), key=lambda x: np.abs(x[1]), - reverse=True) + reverse=True, + ) return np.array([x[0] for x in feature_weights[:num_features]]) - elif method == 'lasso_path': - weighted_data = ((data - np.average(data, axis=0, weights=weights)) - * np.sqrt(weights[:, np.newaxis])) - weighted_labels = ((labels - np.average(labels, weights=weights)) - * np.sqrt(weights)) + elif method == "lasso_path": + weighted_data = (data - np.average(data, axis=0, weights=weights)) * np.sqrt( + weights[:, np.newaxis] + ) + weighted_labels = (labels - np.average(labels, weights=weights)) * np.sqrt(weights) nonzero = range(weighted_data.shape[1]) - _, coefs = self.generate_lars_path(weighted_data, - weighted_labels) + _, coefs = self.generate_lars_path(weighted_data, weighted_labels) for i in range(len(coefs.T) - 1, 0, -1): nonzero = coefs.T[i].nonzero()[0] if len(nonzero) <= num_features: break used_features = nonzero return used_features - elif method == 'auto': + elif method == "auto": if num_features <= 6: - n_method = 'forward_selection' + n_method = "forward_selection" else: - n_method = 'highest_weights' - return self.feature_selection(data, labels, weights, - num_features, n_method) - - def explain_instance_with_data(self, - neighborhood_data, - neighborhood_labels, - distances, - label, - num_features, - feature_selection='auto', - model_regressor=None): + n_method = "highest_weights" + return self.feature_selection(data, labels, weights, num_features, n_method) + + def explain_instance_with_data( + self, + neighborhood_data, + neighborhood_labels, + distances, + label, + num_features, + feature_selection="auto", + model_regressor=None, + ): """Takes perturbed data, labels and distances, returns explanation. 
Args: @@ -180,28 +172,28 @@ def explain_instance_with_data(self, weights = self.kernel_fn(distances) labels_column = neighborhood_labels[:, label] - used_features = self.feature_selection(neighborhood_data, - labels_column, - weights, - num_features, - feature_selection) + used_features = self.feature_selection( + neighborhood_data, labels_column, weights, num_features, feature_selection + ) if model_regressor is None: - model_regressor = Ridge(alpha=1, fit_intercept=True, - random_state=self.random_state) + model_regressor = Ridge(alpha=1, fit_intercept=True, random_state=self.random_state) easy_model = model_regressor - easy_model.fit(neighborhood_data[:, used_features], - labels_column, sample_weight=weights) + easy_model.fit(neighborhood_data[:, used_features], labels_column, sample_weight=weights) prediction_score = easy_model.score( - neighborhood_data[:, used_features], - labels_column, sample_weight=weights) + neighborhood_data[:, used_features], labels_column, sample_weight=weights + ) local_pred = easy_model.predict(neighborhood_data[0, used_features].reshape(1, -1)) if self.verbose: - print('Intercept', easy_model.intercept_) - print('Prediction_local', local_pred,) - print('Right:', neighborhood_labels[0, label]) - return (easy_model.intercept_, - sorted(zip(used_features, easy_model.coef_), - key=lambda x: np.abs(x[1]), reverse=True), - prediction_score, local_pred) + print("Intercept", easy_model.intercept_) + print( + "Prediction_local", local_pred, + ) + print("Right:", neighborhood_labels[0, label]) + return ( + easy_model.intercept_, + sorted(zip(used_features, easy_model.coef_), key=lambda x: np.abs(x[1]), reverse=True), + prediction_score, + local_pred, + ) diff --git a/lime/lime_tabular.py b/lime/lime_tabular.py index 880f3d39..07153f4e 100644 --- a/lime/lime_tabular.py +++ b/lime/lime_tabular.py @@ -3,33 +3,41 @@ """ import collections import copy -from functools import partial import json import warnings +from functools import partial import numpy as np import scipy as sp import sklearn import sklearn.preprocessing +from scipy.stats.distributions import norm from sklearn.utils import check_random_state + +from lime.discretize import ( + BaseDiscretizer, + DecileDiscretizer, + EntropyDiscretizer, + QuartileDiscretizer, + StatsDiscretizer, +) from pyDOE2 import lhs -from scipy.stats.distributions import norm -from lime.discretize import QuartileDiscretizer -from lime.discretize import DecileDiscretizer -from lime.discretize import EntropyDiscretizer -from lime.discretize import BaseDiscretizer -from lime.discretize import StatsDiscretizer -from . import explanation -from . import lime_base +from . import explanation, lime_base class TableDomainMapper(explanation.DomainMapper): """Maps feature ids to names, generates table views, etc""" - def __init__(self, feature_names, feature_values, scaled_row, - categorical_features, discretized_feature_names=None, - feature_indexes=None): + def __init__( + self, + feature_names, + feature_values, + scaled_row, + categorical_features, + discretized_feature_names=None, + feature_indexes=None, + ): """Init. Args: @@ -65,13 +73,9 @@ def map_exp_ids(self, exp): names = self.discretized_feature_names return [(names[x[0]], x[1]) for x in exp] - def visualize_instance_html(self, - exp, - label, - div_name, - exp_object_name, - show_table=True, - show_all=False): + def visualize_instance_html( + self, exp, label, div_name, exp_object_name, show_table=True, show_all=False, + ): """Shows the current example in a table format. 
Args: @@ -83,7 +87,7 @@ def visualize_instance_html(self, show_all: if True, show zero-weighted features in the table. """ if not show_table: - return '' + return "" weights = [0] * len(self.feature_names) for x in exp: weights[x[0]] = x[1] @@ -92,25 +96,27 @@ def visualize_instance_html(self, fnames = [self.exp_feature_names[i] for i in self.feature_indexes] fweights = [weights[i] for i in self.feature_indexes] if show_all: - out_list = list(zip(fnames, - self.feature_values, - fweights)) + out_list = list(zip(fnames, self.feature_values, fweights)) else: - out_dict = dict(map(lambda x: (x[0], (x[1], x[2], x[3])), - zip(self.feature_indexes, - fnames, - self.feature_values, - fweights))) + out_dict = dict( + map( + lambda x: (x[0], (x[1], x[2], x[3])), + zip(self.feature_indexes, fnames, self.feature_values, fweights,), + ) + ) out_list = [out_dict.get(x[0], (str(x[0]), 0.0, 0.0)) for x in exp] else: - out_list = list(zip(self.exp_feature_names, - self.feature_values, - weights)) + out_list = list(zip(self.exp_feature_names, self.feature_values, weights)) if not show_all: out_list = [out_list[x[0]] for x in exp] - ret = u''' + ret = u""" %s.show_raw_tabular(%s, %d, %s); - ''' % (exp_object_name, json.dumps(out_list, ensure_ascii=False), label, div_name) + """ % ( + exp_object_name, + json.dumps(out_list, ensure_ascii=False), + label, + div_name, + ) return ret @@ -123,23 +129,25 @@ class LimeTabularExplainer(object): feature that is 1 when the value is the same as the instance being explained.""" - def __init__(self, - training_data, - mode="classification", - training_labels=None, - feature_names=None, - categorical_features=None, - categorical_names=None, - kernel_width=None, - kernel=None, - verbose=False, - class_names=None, - feature_selection='auto', - discretize_continuous=True, - discretizer='quartile', - sample_around_instance=False, - random_state=None, - training_data_stats=None): + def __init__( + self, + training_data, + mode="classification", + training_labels=None, + feature_names=None, + categorical_features=None, + categorical_names=None, + kernel_width=None, + kernel=None, + verbose=False, + class_names=None, + feature_selection="auto", + discretize_continuous=True, + discretizer="quartile", + sample_around_instance=False, + random_state=None, + training_data_stats=None, + ): """Init function. 
Args: @@ -209,46 +217,58 @@ def __init__(self, # Set the discretizer if training data stats are provided if self.training_data_stats: discretizer = StatsDiscretizer( - training_data, self.categorical_features, - self.feature_names, labels=training_labels, + training_data, + self.categorical_features, + self.feature_names, + labels=training_labels, data_stats=self.training_data_stats, - random_state=self.random_state) + random_state=self.random_state, + ) - if discretizer == 'quartile': + if discretizer == "quartile": self.discretizer = QuartileDiscretizer( - training_data, self.categorical_features, - self.feature_names, labels=training_labels, - random_state=self.random_state) - elif discretizer == 'decile': + training_data, + self.categorical_features, + self.feature_names, + labels=training_labels, + random_state=self.random_state, + ) + elif discretizer == "decile": self.discretizer = DecileDiscretizer( - training_data, self.categorical_features, - self.feature_names, labels=training_labels, - random_state=self.random_state) - elif discretizer == 'entropy': + training_data, + self.categorical_features, + self.feature_names, + labels=training_labels, + random_state=self.random_state, + ) + elif discretizer == "entropy": self.discretizer = EntropyDiscretizer( - training_data, self.categorical_features, - self.feature_names, labels=training_labels, - random_state=self.random_state) + training_data, + self.categorical_features, + self.feature_names, + labels=training_labels, + random_state=self.random_state, + ) elif isinstance(discretizer, BaseDiscretizer): self.discretizer = discretizer else: - raise ValueError('''Discretizer must be 'quartile',''' + - ''' 'decile', 'entropy' or a''' + - ''' BaseDiscretizer instance''') + raise ValueError( + """Discretizer must be 'quartile',""" + + """ 'decile', 'entropy' or a""" + + """ BaseDiscretizer instance""" + ) self.categorical_features = list(range(training_data.shape[1])) # Get the discretized_training_data when the stats are not provided - if(self.training_data_stats is None): - discretized_training_data = self.discretizer.discretize( - training_data) + if self.training_data_stats is None: + discretized_training_data = self.discretizer.discretize(training_data) if kernel_width is None: - kernel_width = np.sqrt(training_data.shape[1]) * .75 + kernel_width = np.sqrt(training_data.shape[1]) * 0.75 kernel_width = float(kernel_width) if kernel is None: - def kernel(d, kernel_width): - return np.sqrt(np.exp(-(d ** 2) / kernel_width ** 2)) + kernel = self.default_kernel_fn kernel_fn = partial(kernel, kernel_width=kernel_width) @@ -276,14 +296,17 @@ def kernel(d, kernel_width): frequencies = training_data_stats["feature_frequencies"][feature] self.feature_values[feature] = values - self.feature_frequencies[feature] = (np.array(frequencies) / - float(sum(frequencies))) + self.feature_frequencies[feature] = np.array(frequencies) / float(sum(frequencies)) self.scaler.mean_[feature] = 0 self.scaler.scale_[feature] = 1 + @staticmethod + def default_kernel_fn(d, kernel_width): + return np.sqrt(np.exp(-(d ** 2) / kernel_width ** 2)) + @staticmethod def convert_and_round(values): - return ['%.2f' % v for v in values] + return ["%.2f" % v for v in values] @staticmethod def validate_training_data_stats(training_data_stats): @@ -291,21 +314,30 @@ def validate_training_data_stats(training_data_stats): Method to validate the structure of training data stats """ stat_keys = list(training_data_stats.keys()) - valid_stat_keys = ["means", "mins", "maxs", "stds", 
"feature_values", "feature_frequencies"] + valid_stat_keys = [ + "means", + "mins", + "maxs", + "stds", + "feature_values", + "feature_frequencies", + ] missing_keys = list(set(valid_stat_keys) - set(stat_keys)) if len(missing_keys) > 0: raise Exception("Missing keys in training_data_stats. Details: %s" % (missing_keys)) - def explain_instance(self, - data_row, - predict_fn, - labels=(1,), - top_labels=None, - num_features=10, - num_samples=5000, - distance_metric='euclidean', - model_regressor=None, - sampling_method='gaussian'): + def explain_instance( + self, + data_row, + predict_fn, + labels=(1,), + top_labels=None, + num_features=10, + num_samples=5000, + distance_metric="euclidean", + model_regressor=None, + sampling_method="gaussian", + ): """Generates explanations for a prediction. First, we generate neighborhood data by randomly perturbing features @@ -353,9 +385,7 @@ def explain_instance(self, else: scaled_data = (data - self.scaler.mean_) / self.scaler.scale_ distances = sklearn.metrics.pairwise_distances( - scaled_data, - scaled_data[0].reshape(1, -1), - metric=distance_metric + scaled_data, scaled_data[0].reshape(1, -1), metric=distance_metric ).ravel() yss = predict_fn(inverse) @@ -364,26 +394,31 @@ def explain_instance(self, # along with prediction probabilities if self.mode == "classification": if len(yss.shape) == 1: - raise NotImplementedError("LIME does not currently support " - "classifier models without probability " - "scores. If this conflicts with your " - "use case, please let us know: " - "https://github.com/datascienceinc/lime/issues/16") + raise NotImplementedError( + "LIME does not currently support " + "classifier models without probability " + "scores. If this conflicts with your " + "use case, please let us know: " + "https://github.com/datascienceinc/lime/issues/16" + ) elif len(yss.shape) == 2: if self.class_names is None: self.class_names = [str(x) for x in range(yss[0].shape[0])] else: self.class_names = list(self.class_names) if not np.allclose(yss.sum(axis=1), 1.0): - warnings.warn(""" + warnings.warn( + """ Prediction probabilties do not sum to 1, and thus does not constitute a probability space. Check that you classifier outputs probabilities (Not log probabilities, or actual class predictions). 
- """) + """ + ) else: - raise ValueError("Your model outputs " - "arrays with {} dimensions".format(len(yss.shape))) + raise ValueError( + "Your model outputs " "arrays with {} dimensions".format(len(yss.shape)) + ) # for regression, the output should be a one-dimensional array of predictions else: @@ -392,8 +427,12 @@ def explain_instance(self, yss = np.array([v[0] for v in yss]) assert isinstance(yss, np.ndarray) and len(yss.shape) == 1 except AssertionError: - raise ValueError("Your model needs to output single-dimensional \ - numpyarrays, not arrays of {} dimensions".format(yss.shape)) + raise ValueError( + "Your model needs to output single-dimensional \ + numpyarrays, not arrays of {} dimensions".format( + yss.shape + ) + ) predicted_value = yss[0] min_y = min(yss) @@ -419,8 +458,8 @@ def explain_instance(self, name = int(data_row[i]) if i in self.categorical_names: name = self.categorical_names[i][name] - feature_names[i] = '%s=%s' % (feature_names[i], name) - values[i] = 'True' + feature_names[i] = "%s=%s" % (feature_names[i], name) + values[i] = "True" categorical_features = self.categorical_features discretized_feature_names = None @@ -429,18 +468,21 @@ def explain_instance(self, discretized_instance = self.discretizer.discretize(data_row) discretized_feature_names = copy.deepcopy(feature_names) for f in self.discretizer.names: - discretized_feature_names[f] = self.discretizer.names[f][int( - discretized_instance[f])] - - domain_mapper = TableDomainMapper(feature_names, - values, - scaled_data[0], - categorical_features=categorical_features, - discretized_feature_names=discretized_feature_names, - feature_indexes=feature_indexes) - ret_exp = explanation.Explanation(domain_mapper, - mode=self.mode, - class_names=self.class_names) + discretized_feature_names[f] = self.discretizer.names[f][ + int(discretized_instance[f]) + ] + + domain_mapper = TableDomainMapper( + feature_names, + values, + scaled_data[0], + categorical_features=categorical_features, + discretized_feature_names=discretized_feature_names, + feature_indexes=feature_indexes, + ) + ret_exp = explanation.Explanation( + domain_mapper, mode=self.mode, class_names=self.class_names + ) if self.mode == "classification": ret_exp.predict_proba = yss[0] if top_labels: @@ -453,17 +495,20 @@ def explain_instance(self, ret_exp.max_value = max_y labels = [0] for label in labels: - (ret_exp.intercept[label], - ret_exp.local_exp[label], - ret_exp.score[label], - ret_exp.local_pred[label]) = self.base.explain_instance_with_data( - scaled_data, - yss, - distances, - label, - num_features, - model_regressor=model_regressor, - feature_selection=self.feature_selection) + ( + ret_exp.intercept[label], + ret_exp.local_exp[label], + ret_exp.score[label], + ret_exp.local_pred[label], + ) = self.base.explain_instance_with_data( + scaled_data, + yss, + distances, + label, + num_features, + model_regressor=model_regressor, + feature_selection=self.feature_selection, + ) if self.mode == "regression": ret_exp.intercept[1] = ret_exp.intercept[0] @@ -472,10 +517,7 @@ def explain_instance(self, return ret_exp - def __data_inverse(self, - data_row, - num_samples, - sampling_method): + def __data_inverse(self, data_row, num_samples, sampling_method): """Generates a neighborhood around a prediction. 
For numerical features, perturb them by sampling from a Normal(0,1) and @@ -518,23 +560,27 @@ def __data_inverse(self, scale = scale[non_zero_indexes] mean = mean[non_zero_indexes] - if sampling_method == 'gaussian': - data = self.random_state.normal(0, 1, num_samples * num_cols - ).reshape(num_samples, num_cols) + if sampling_method == "gaussian": + data = self.random_state.normal(0, 1, num_samples * num_cols).reshape( + num_samples, num_cols + ) data = np.array(data) - elif sampling_method == 'lhs': - data = lhs(num_cols, samples=num_samples - ).reshape(num_samples, num_cols) + elif sampling_method == "lhs": + data = lhs(num_cols, samples=num_samples).reshape(num_samples, num_cols) means = np.zeros(num_cols) - stdvs = np.array([1]*num_cols) + stdvs = np.array([1] * num_cols) for i in range(num_cols): data[:, i] = norm(loc=means[i], scale=stdvs[i]).ppf(data[:, i]) data = np.array(data) else: - warnings.warn('''Invalid input for sampling_method. - Defaulting to Gaussian sampling.''', UserWarning) - data = self.random_state.normal(0, 1, num_samples * num_cols - ).reshape(num_samples, num_cols) + warnings.warn( + """Invalid input for sampling_method. + Defaulting to Gaussian sampling.""", + UserWarning, + ) + data = self.random_state.normal(0, 1, num_samples * num_cols).reshape( + num_samples, num_cols + ) data = np.array(data) if self.sample_around_instance: @@ -543,19 +589,19 @@ def __data_inverse(self, data = data * scale + mean if is_sparse: if num_cols == 0: - data = sp.sparse.csr_matrix((num_samples, - data_row.shape[1]), - dtype=data_row.dtype) + data = sp.sparse.csr_matrix( + (num_samples, data_row.shape[1]), dtype=data_row.dtype + ) else: indexes = np.tile(non_zero_indexes, num_samples) indptr = np.array( - range(0, len(non_zero_indexes) * (num_samples + 1), - len(non_zero_indexes))) + range(0, len(non_zero_indexes) * (num_samples + 1), len(non_zero_indexes),) + ) data_1d_shape = data.shape[0] * data.shape[1] data_1d = data.reshape(data_1d_shape) data = sp.sparse.csr_matrix( - (data_1d, indexes, indptr), - shape=(num_samples, data_row.shape[1])) + (data_1d, indexes, indptr), shape=(num_samples, data_row.shape[1]), + ) categorical_features = self.categorical_features first_row = data_row else: @@ -565,8 +611,9 @@ def __data_inverse(self, for column in categorical_features: values = self.feature_values[column] freqs = self.feature_frequencies[column] - inverse_column = self.random_state.choice(values, size=num_samples, - replace=True, p=freqs) + inverse_column = self.random_state.choice( + values, size=num_samples, replace=True, p=freqs + ) binary_column = (inverse_column == first_row[column]).astype(int) binary_column[0] = 1 inverse_column[0] = data[0, column] @@ -593,12 +640,23 @@ class RecurrentTabularExplainer(LimeTabularExplainer): """ - def __init__(self, training_data, mode="classification", - training_labels=None, feature_names=None, - categorical_features=None, categorical_names=None, - kernel_width=None, kernel=None, verbose=False, class_names=None, - feature_selection='auto', discretize_continuous=True, - discretizer='quartile', random_state=None): + def __init__( + self, + training_data, + mode="classification", + training_labels=None, + feature_names=None, + categorical_features=None, + categorical_names=None, + kernel_width=None, + kernel=None, + verbose=False, + class_names=None, + feature_selection="auto", + discretize_continuous=True, + discretizer="quartile", + random_state=None, + ): """ Args: training_data: numpy 3d array with shape @@ -640,32 +698,37 @@ def 
__init__(self, training_data, mode="classification", # Reshape X n_samples, n_timesteps, n_features = training_data.shape training_data = np.transpose(training_data, axes=(0, 2, 1)).reshape( - n_samples, n_timesteps * n_features) + n_samples, n_timesteps * n_features + ) self.n_timesteps = n_timesteps self.n_features = n_features if feature_names is None: - feature_names = ['feature%d' % i for i in range(n_features)] + feature_names = ["feature%d" % i for i in range(n_features)] # Update the feature names - feature_names = ['{}_t-{}'.format(n, n_timesteps - (i + 1)) - for n in feature_names for i in range(n_timesteps)] + feature_names = [ + "{}_t-{}".format(n, n_timesteps - (i + 1)) + for n in feature_names + for i in range(n_timesteps) + ] # Send off the the super class to do its magic. super(RecurrentTabularExplainer, self).__init__( - training_data, - mode=mode, - training_labels=training_labels, - feature_names=feature_names, - categorical_features=categorical_features, - categorical_names=categorical_names, - kernel_width=kernel_width, - kernel=kernel, - verbose=verbose, - class_names=class_names, - feature_selection=feature_selection, - discretize_continuous=discretize_continuous, - discretizer=discretizer, - random_state=random_state) + training_data, + mode=mode, + training_labels=training_labels, + feature_names=feature_names, + categorical_features=categorical_features, + categorical_names=categorical_names, + kernel_width=kernel_width, + kernel=kernel, + verbose=verbose, + class_names=class_names, + feature_selection=feature_selection, + discretize_continuous=discretize_continuous, + discretizer=discretizer, + random_state=random_state, + ) def _make_predict_proba(self, func): """ @@ -683,9 +746,17 @@ def predict_proba(X): return predict_proba - def explain_instance(self, data_row, classifier_fn, labels=(1,), - top_labels=None, num_features=10, num_samples=5000, - distance_metric='euclidean', model_regressor=None): + def explain_instance( + self, + data_row, + classifier_fn, + labels=(1,), + top_labels=None, + num_features=10, + num_samples=5000, + distance_metric="euclidean", + model_regressor=None, + ): """Generates explanations for a prediction. 
First, we generate neighborhood data by randomly perturbing features @@ -721,10 +792,12 @@ def explain_instance(self, data_row, classifier_fn, labels=(1,), # Wrap the classifier to reshape input classifier_fn = self._make_predict_proba(classifier_fn) return super(RecurrentTabularExplainer, self).explain_instance( - data_row, classifier_fn, + data_row, + classifier_fn, labels=labels, top_labels=top_labels, num_features=num_features, num_samples=num_samples, distance_metric=distance_metric, - model_regressor=model_regressor) + model_regressor=model_regressor, + ) diff --git a/lime/tests/test_lime_tabular.py b/lime/tests/test_lime_tabular.py index 426079b4..5c0e923a 100644 --- a/lime/tests/test_lime_tabular.py +++ b/lime/tests/test_lime_tabular.py @@ -1,7 +1,9 @@ +import collections +import multiprocessing as mp import unittest +from functools import partial import numpy as np -import collections import sklearn # noqa import sklearn.datasets import sklearn.ensemble @@ -10,8 +12,10 @@ from sklearn.datasets import load_iris, make_classification, make_multilabel_classification from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LinearRegression -from lime.discretize import QuartileDiscretizer, DecileDiscretizer, EntropyDiscretizer +from lime.discretize import DecileDiscretizer, EntropyDiscretizer, QuartileDiscretizer +from lime.explanation import Explanation +from lime.lime_tabular import LimeTabularExplainer try: from sklearn.model_selection import train_test_split @@ -19,21 +23,17 @@ # Deprecated in scikit-learn version 0.18, removed in 0.20 from sklearn.cross_validation import train_test_split -from lime.lime_tabular import LimeTabularExplainer - class TestLimeTabular(unittest.TestCase): - def setUp(self): iris = load_iris() self.feature_names = iris.feature_names self.target_names = iris.target_names - (self.train, - self.test, - self.labels_train, - self.labels_test) = train_test_split(iris.data, iris.target, train_size=0.80) + (self.train, self.test, self.labels_train, self.labels_test) = train_test_split( + iris.data, iris.target, train_size=0.80 + ) def test_lime_explainer_good_regressor(self): np.random.seed(1) @@ -41,40 +41,39 @@ def test_lime_explainer_good_regressor(self): rf.fit(self.train, self.labels_train) i = np.random.randint(0, self.test.shape[0]) - explainer = LimeTabularExplainer(self.train, - mode="classification", - feature_names=self.feature_names, - class_names=self.target_names, - discretize_continuous=True) + explainer = LimeTabularExplainer( + self.train, + mode="classification", + feature_names=self.feature_names, + class_names=self.target_names, + discretize_continuous=True, + ) - exp = explainer.explain_instance(self.test[i], - rf.predict_proba, - num_features=2, - model_regressor=LinearRegression()) + exp = explainer.explain_instance( + self.test[i], rf.predict_proba, num_features=2, model_regressor=LinearRegression() + ) self.assertIsNotNone(exp) keys = [x[0] for x in exp.as_list()] - self.assertEqual(1, - sum([1 if 'petal width' in x else 0 for x in keys]), - "Petal Width is a major feature") - self.assertEqual(1, - sum([1 if 'petal length' in x else 0 for x in keys]), - "Petal Length is a major feature") + self.assertEqual( + 1, sum([1 if "petal width" in x else 0 for x in keys]), "Petal Width is a major feature" + ) + self.assertEqual( + 1, + sum([1 if "petal length" in x else 0 for x in keys]), + "Petal Length is a major feature", + ) def test_lime_explainer_good_regressor_synthetic_data(self): - X, y = 
make_classification(n_samples=1000, - n_features=20, - n_informative=2, - n_redundant=2, - random_state=10) + X, y = make_classification( + n_samples=1000, n_features=20, n_informative=2, n_redundant=2, random_state=10 + ) rf = RandomForestClassifier(n_estimators=500) rf.fit(X, y) instance = np.random.randint(0, X.shape[0]) feature_names = ["feature" + str(i) for i in range(20)] - explainer = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True) + explainer = LimeTabularExplainer(X, feature_names=feature_names, discretize_continuous=True) exp = explainer.explain_instance(X[instance], rf.predict_proba) @@ -83,18 +82,14 @@ def test_lime_explainer_good_regressor_synthetic_data(self): def test_lime_explainer_sparse_synthetic_data(self): n_features = 20 - X, y = make_multilabel_classification(n_samples=100, - sparse=True, - n_features=n_features, - n_classes=1, - n_labels=2) + X, y = make_multilabel_classification( + n_samples=100, sparse=True, n_features=n_features, n_classes=1, n_labels=2 + ) rf = RandomForestClassifier(n_estimators=500) rf.fit(X, y) instance = np.random.randint(0, X.shape[0]) feature_names = ["feature" + str(i) for i in range(n_features)] - explainer = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True) + explainer = LimeTabularExplainer(X, feature_names=feature_names, discretize_continuous=True) exp = explainer.explain_instance(X[instance], rf.predict_proba) @@ -108,22 +103,24 @@ def test_lime_explainer_no_regressor(self): rf.fit(self.train, self.labels_train) i = np.random.randint(0, self.test.shape[0]) - explainer = LimeTabularExplainer(self.train, - feature_names=self.feature_names, - class_names=self.target_names, - discretize_continuous=True) + explainer = LimeTabularExplainer( + self.train, + feature_names=self.feature_names, + class_names=self.target_names, + discretize_continuous=True, + ) - exp = explainer.explain_instance(self.test[i], - rf.predict_proba, - num_features=2) + exp = explainer.explain_instance(self.test[i], rf.predict_proba, num_features=2) self.assertIsNotNone(exp) keys = [x[0] for x in exp.as_list()] - self.assertEqual(1, - sum([1 if 'petal width' in x else 0 for x in keys]), - "Petal Width is a major feature") - self.assertEqual(1, - sum([1 if 'petal length' in x else 0 for x in keys]), - "Petal Length is a major feature") + self.assertEqual( + 1, sum([1 if "petal width" in x else 0 for x in keys]), "Petal Width is a major feature" + ) + self.assertEqual( + 1, + sum([1 if "petal length" in x else 0 for x in keys]), + "Petal Length is a major feature", + ) def test_lime_explainer_entropy_discretizer(self): np.random.seed(1) @@ -132,32 +129,32 @@ def test_lime_explainer_entropy_discretizer(self): rf.fit(self.train, self.labels_train) i = np.random.randint(0, self.test.shape[0]) - explainer = LimeTabularExplainer(self.train, - feature_names=self.feature_names, - class_names=self.target_names, - training_labels=self.labels_train, - discretize_continuous=True, - discretizer='entropy') + explainer = LimeTabularExplainer( + self.train, + feature_names=self.feature_names, + class_names=self.target_names, + training_labels=self.labels_train, + discretize_continuous=True, + discretizer="entropy", + ) - exp = explainer.explain_instance(self.test[i], - rf.predict_proba, - num_features=2) + exp = explainer.explain_instance(self.test[i], rf.predict_proba, num_features=2) self.assertIsNotNone(exp) keys = [x[0] for x in exp.as_list()] print(keys) - self.assertEqual(1, - sum([1 if 'petal width' in x 
else 0 for x in keys]), - "Petal Width is a major feature") - self.assertEqual(1, - sum([1 if 'petal length' in x else 0 for x in keys]), - "Petal Length is a major feature") + self.assertEqual( + 1, sum([1 if "petal width" in x else 0 for x in keys]), "Petal Width is a major feature" + ) + self.assertEqual( + 1, + sum([1 if "petal length" in x else 0 for x in keys]), + "Petal Length is a major feature", + ) def test_lime_tabular_explainer_equal_random_state(self): - X, y = make_classification(n_samples=1000, - n_features=20, - n_informative=2, - n_redundant=2, - random_state=10) + X, y = make_classification( + n_samples=1000, n_features=20, n_informative=2, n_redundant=2, random_state=10 + ) rf = RandomForestClassifier(n_estimators=500, random_state=10) rf.fit(X, y) @@ -167,84 +164,82 @@ def test_lime_tabular_explainer_equal_random_state(self): # ---------------------------------------------------------------------- # -------------------------Quartile Discretizer------------------------- # ---------------------------------------------------------------------- - discretizer = QuartileDiscretizer(X, [], feature_names, y, - random_state=10) - explainer_1 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - random_state=10) - exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, - num_samples=500) - - discretizer = QuartileDiscretizer(X, [], feature_names, y, - random_state=10) - explainer_2 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - random_state=10) - exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, - num_samples=500) + discretizer = QuartileDiscretizer(X, [], feature_names, y, random_state=10) + explainer_1 = LimeTabularExplainer( + X, + feature_names=feature_names, + discretize_continuous=True, + discretizer=discretizer, + random_state=10, + ) + exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, num_samples=500) + + discretizer = QuartileDiscretizer(X, [], feature_names, y, random_state=10) + explainer_2 = LimeTabularExplainer( + X, + feature_names=feature_names, + discretize_continuous=True, + discretizer=discretizer, + random_state=10, + ) + exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, num_samples=500) self.assertDictEqual(exp_1.as_map(), exp_2.as_map()) # ---------------------------------------------------------------------- # --------------------------Decile Discretizer-------------------------- # ---------------------------------------------------------------------- - discretizer = DecileDiscretizer(X, [], feature_names, y, - random_state=10) - explainer_1 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - random_state=10) - exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, - num_samples=500) - - discretizer = DecileDiscretizer(X, [], feature_names, y, - random_state=10) - explainer_2 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - random_state=10) - exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, - num_samples=500) + discretizer = DecileDiscretizer(X, [], feature_names, y, random_state=10) + explainer_1 = LimeTabularExplainer( + X, + feature_names=feature_names, + discretize_continuous=True, + discretizer=discretizer, + random_state=10, + ) + exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, 
num_samples=500) + + discretizer = DecileDiscretizer(X, [], feature_names, y, random_state=10) + explainer_2 = LimeTabularExplainer( + X, + feature_names=feature_names, + discretize_continuous=True, + discretizer=discretizer, + random_state=10, + ) + exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, num_samples=500) self.assertDictEqual(exp_1.as_map(), exp_2.as_map()) # ---------------------------------------------------------------------- # -------------------------Entropy Discretizer-------------------------- # ---------------------------------------------------------------------- - discretizer = EntropyDiscretizer(X, [], feature_names, y, - random_state=10) - explainer_1 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - random_state=10) - exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, - num_samples=500) - - discretizer = EntropyDiscretizer(X, [], feature_names, y, - random_state=10) - explainer_2 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - random_state=10) - exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, - num_samples=500) + discretizer = EntropyDiscretizer(X, [], feature_names, y, random_state=10) + explainer_1 = LimeTabularExplainer( + X, + feature_names=feature_names, + discretize_continuous=True, + discretizer=discretizer, + random_state=10, + ) + exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, num_samples=500) + + discretizer = EntropyDiscretizer(X, [], feature_names, y, random_state=10) + explainer_2 = LimeTabularExplainer( + X, + feature_names=feature_names, + discretize_continuous=True, + discretizer=discretizer, + random_state=10, + ) + exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, num_samples=500) self.assertDictEqual(exp_1.as_map(), exp_2.as_map()) def test_lime_tabular_explainer_not_equal_random_state(self): - X, y = make_classification(n_samples=1000, - n_features=20, - n_informative=2, - n_redundant=2, - random_state=10) + X, y = make_classification( + n_samples=1000, n_features=20, n_informative=2, n_redundant=2, random_state=10 + ) rf = RandomForestClassifier(n_estimators=500, random_state=10) rf.fit(X, y) @@ -256,94 +251,94 @@ def test_lime_tabular_explainer_not_equal_random_state(self): # ---------------------------------------------------------------------- # ---------------------------------[1]---------------------------------- - discretizer = QuartileDiscretizer(X, [], feature_names, y, - random_state=20) - explainer_1 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - random_state=10) - exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, - num_samples=500) - - discretizer = QuartileDiscretizer(X, [], feature_names, y, - random_state=10) - explainer_2 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - random_state=10) - exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, - num_samples=500) + discretizer = QuartileDiscretizer(X, [], feature_names, y, random_state=20) + explainer_1 = LimeTabularExplainer( + X, + feature_names=feature_names, + discretize_continuous=True, + discretizer=discretizer, + random_state=10, + ) + exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, num_samples=500) + + discretizer = QuartileDiscretizer(X, [], feature_names, y, 
random_state=10) + explainer_2 = LimeTabularExplainer( + X, + feature_names=feature_names, + discretize_continuous=True, + discretizer=discretizer, + random_state=10, + ) + exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, num_samples=500) self.assertTrue(exp_1.as_map() != exp_2.as_map()) # ---------------------------------[2]---------------------------------- - discretizer = QuartileDiscretizer(X, [], feature_names, y, - random_state=20) - explainer_1 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - random_state=20) - exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, - num_samples=500) - - discretizer = QuartileDiscretizer(X, [], feature_names, y, - random_state=10) - explainer_2 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - random_state=10) - exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, - num_samples=500) + discretizer = QuartileDiscretizer(X, [], feature_names, y, random_state=20) + explainer_1 = LimeTabularExplainer( + X, + feature_names=feature_names, + discretize_continuous=True, + discretizer=discretizer, + random_state=20, + ) + exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, num_samples=500) + + discretizer = QuartileDiscretizer(X, [], feature_names, y, random_state=10) + explainer_2 = LimeTabularExplainer( + X, + feature_names=feature_names, + discretize_continuous=True, + discretizer=discretizer, + random_state=10, + ) + exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, num_samples=500) self.assertTrue(exp_1.as_map() != exp_2.as_map()) # ---------------------------------[3]---------------------------------- - discretizer = QuartileDiscretizer(X, [], feature_names, y, - random_state=20) - explainer_1 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - random_state=20) - exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, - num_samples=500) - - discretizer = QuartileDiscretizer(X, [], feature_names, y, - random_state=20) - explainer_2 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - random_state=10) - exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, - num_samples=500) + discretizer = QuartileDiscretizer(X, [], feature_names, y, random_state=20) + explainer_1 = LimeTabularExplainer( + X, + feature_names=feature_names, + discretize_continuous=True, + discretizer=discretizer, + random_state=20, + ) + exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, num_samples=500) + + discretizer = QuartileDiscretizer(X, [], feature_names, y, random_state=20) + explainer_2 = LimeTabularExplainer( + X, + feature_names=feature_names, + discretize_continuous=True, + discretizer=discretizer, + random_state=10, + ) + exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, num_samples=500) self.assertTrue(exp_1.as_map() != exp_2.as_map()) # ---------------------------------[4]---------------------------------- - discretizer = QuartileDiscretizer(X, [], feature_names, y, - random_state=20) - explainer_1 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - random_state=20) - exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, - num_samples=500) - - discretizer = QuartileDiscretizer(X, [], feature_names, y, 
- random_state=20) - explainer_2 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - random_state=20) - exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, - num_samples=500) + discretizer = QuartileDiscretizer(X, [], feature_names, y, random_state=20) + explainer_1 = LimeTabularExplainer( + X, + feature_names=feature_names, + discretize_continuous=True, + discretizer=discretizer, + random_state=20, + ) + exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, num_samples=500) + + discretizer = QuartileDiscretizer(X, [], feature_names, y, random_state=20) + explainer_2 = LimeTabularExplainer( + X, + feature_names=feature_names, + discretize_continuous=True, + discretizer=discretizer, + random_state=20, + ) + exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, num_samples=500) self.assertFalse(exp_1.as_map() != exp_2.as_map()) @@ -352,94 +347,94 @@ def test_lime_tabular_explainer_not_equal_random_state(self): # ---------------------------------------------------------------------- # ---------------------------------[1]---------------------------------- - discretizer = DecileDiscretizer(X, [], feature_names, y, - random_state=20) - explainer_1 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - random_state=10) - exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, - num_samples=500) - - discretizer = DecileDiscretizer(X, [], feature_names, y, - random_state=10) - explainer_2 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - random_state=10) - exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, - num_samples=500) + discretizer = DecileDiscretizer(X, [], feature_names, y, random_state=20) + explainer_1 = LimeTabularExplainer( + X, + feature_names=feature_names, + discretize_continuous=True, + discretizer=discretizer, + random_state=10, + ) + exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, num_samples=500) + + discretizer = DecileDiscretizer(X, [], feature_names, y, random_state=10) + explainer_2 = LimeTabularExplainer( + X, + feature_names=feature_names, + discretize_continuous=True, + discretizer=discretizer, + random_state=10, + ) + exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, num_samples=500) self.assertTrue(exp_1.as_map() != exp_2.as_map()) # ---------------------------------[2]---------------------------------- - discretizer = DecileDiscretizer(X, [], feature_names, y, - random_state=20) - explainer_1 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - random_state=20) - exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, - num_samples=500) - - discretizer = DecileDiscretizer(X, [], feature_names, y, - random_state=10) - explainer_2 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - random_state=10) - exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, - num_samples=500) + discretizer = DecileDiscretizer(X, [], feature_names, y, random_state=20) + explainer_1 = LimeTabularExplainer( + X, + feature_names=feature_names, + discretize_continuous=True, + discretizer=discretizer, + random_state=20, + ) + exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, num_samples=500) + + discretizer = DecileDiscretizer(X, [], 
feature_names, y, random_state=10) + explainer_2 = LimeTabularExplainer( + X, + feature_names=feature_names, + discretize_continuous=True, + discretizer=discretizer, + random_state=10, + ) + exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, num_samples=500) self.assertTrue(exp_1.as_map() != exp_2.as_map()) # ---------------------------------[3]---------------------------------- - discretizer = DecileDiscretizer(X, [], feature_names, y, - random_state=20) - explainer_1 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - random_state=20) - exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, - num_samples=500) - - discretizer = DecileDiscretizer(X, [], feature_names, y, - random_state=20) - explainer_2 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - random_state=10) - exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, - num_samples=500) + discretizer = DecileDiscretizer(X, [], feature_names, y, random_state=20) + explainer_1 = LimeTabularExplainer( + X, + feature_names=feature_names, + discretize_continuous=True, + discretizer=discretizer, + random_state=20, + ) + exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, num_samples=500) + + discretizer = DecileDiscretizer(X, [], feature_names, y, random_state=20) + explainer_2 = LimeTabularExplainer( + X, + feature_names=feature_names, + discretize_continuous=True, + discretizer=discretizer, + random_state=10, + ) + exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, num_samples=500) self.assertTrue(exp_1.as_map() != exp_2.as_map()) # ---------------------------------[4]---------------------------------- - discretizer = DecileDiscretizer(X, [], feature_names, y, - random_state=20) - explainer_1 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - random_state=20) - exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, - num_samples=500) - - discretizer = DecileDiscretizer(X, [], feature_names, y, - random_state=20) - explainer_2 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - random_state=20) - exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, - num_samples=500) + discretizer = DecileDiscretizer(X, [], feature_names, y, random_state=20) + explainer_1 = LimeTabularExplainer( + X, + feature_names=feature_names, + discretize_continuous=True, + discretizer=discretizer, + random_state=20, + ) + exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, num_samples=500) + + discretizer = DecileDiscretizer(X, [], feature_names, y, random_state=20) + explainer_2 = LimeTabularExplainer( + X, + feature_names=feature_names, + discretize_continuous=True, + discretizer=discretizer, + random_state=20, + ) + exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, num_samples=500) self.assertFalse(exp_1.as_map() != exp_2.as_map()) @@ -448,137 +443,130 @@ def test_lime_tabular_explainer_not_equal_random_state(self): # ---------------------------------------------------------------------- # ---------------------------------[1]---------------------------------- - discretizer = EntropyDiscretizer(X, [], feature_names, y, - random_state=20) - explainer_1 = LimeTabularExplainer(X, - feature_names=feature_names, - discretize_continuous=True, - discretizer=discretizer, - 
-                                           random_state=10)
-        exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
-                                             num_samples=500)
-
-        discretizer = EntropyDiscretizer(X, [], feature_names, y,
-                                         random_state=10)
-        explainer_2 = LimeTabularExplainer(X,
-                                           feature_names=feature_names,
-                                           discretize_continuous=True,
-                                           discretizer=discretizer,
-                                           random_state=10)
-        exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
-                                             num_samples=500)
+        discretizer = EntropyDiscretizer(X, [], feature_names, y, random_state=20)
+        explainer_1 = LimeTabularExplainer(
+            X,
+            feature_names=feature_names,
+            discretize_continuous=True,
+            discretizer=discretizer,
+            random_state=10,
+        )
+        exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, num_samples=500)
+
+        discretizer = EntropyDiscretizer(X, [], feature_names, y, random_state=10)
+        explainer_2 = LimeTabularExplainer(
+            X,
+            feature_names=feature_names,
+            discretize_continuous=True,
+            discretizer=discretizer,
+            random_state=10,
+        )
+        exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, num_samples=500)
 
         self.assertTrue(exp_1.as_map() != exp_2.as_map())
 
         # ---------------------------------[2]----------------------------------
-        discretizer = EntropyDiscretizer(X, [], feature_names, y,
-                                         random_state=20)
-        explainer_1 = LimeTabularExplainer(X,
-                                           feature_names=feature_names,
-                                           discretize_continuous=True,
-                                           discretizer=discretizer,
-                                           random_state=20)
-        exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
-                                             num_samples=500)
-
-        discretizer = EntropyDiscretizer(X, [], feature_names, y,
-                                         random_state=10)
-        explainer_2 = LimeTabularExplainer(X,
-                                           feature_names=feature_names,
-                                           discretize_continuous=True,
-                                           discretizer=discretizer,
-                                           random_state=10)
-        exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
-                                             num_samples=500)
+        discretizer = EntropyDiscretizer(X, [], feature_names, y, random_state=20)
+        explainer_1 = LimeTabularExplainer(
+            X,
+            feature_names=feature_names,
+            discretize_continuous=True,
+            discretizer=discretizer,
+            random_state=20,
+        )
+        exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, num_samples=500)
+
+        discretizer = EntropyDiscretizer(X, [], feature_names, y, random_state=10)
+        explainer_2 = LimeTabularExplainer(
+            X,
+            feature_names=feature_names,
+            discretize_continuous=True,
+            discretizer=discretizer,
+            random_state=10,
+        )
+        exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, num_samples=500)
 
         self.assertTrue(exp_1.as_map() != exp_2.as_map())
 
         # ---------------------------------[3]----------------------------------
-        discretizer = EntropyDiscretizer(X, [], feature_names, y,
-                                         random_state=20)
-        explainer_1 = LimeTabularExplainer(X,
-                                           feature_names=feature_names,
-                                           discretize_continuous=True,
-                                           discretizer=discretizer,
-                                           random_state=20)
-        exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
-                                             num_samples=500)
-
-        discretizer = EntropyDiscretizer(X, [], feature_names, y,
-                                         random_state=20)
-        explainer_2 = LimeTabularExplainer(X,
-                                           feature_names=feature_names,
-                                           discretize_continuous=True,
-                                           discretizer=discretizer,
-                                           random_state=10)
-        exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
-                                             num_samples=500)
+        discretizer = EntropyDiscretizer(X, [], feature_names, y, random_state=20)
+        explainer_1 = LimeTabularExplainer(
+            X,
+            feature_names=feature_names,
+            discretize_continuous=True,
+            discretizer=discretizer,
+            random_state=20,
+        )
+        exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, num_samples=500)
+
+        discretizer = EntropyDiscretizer(X, [], feature_names, y, random_state=20)
+        explainer_2 = LimeTabularExplainer(
+            X,
+            feature_names=feature_names,
+            discretize_continuous=True,
+            discretizer=discretizer,
+            random_state=10,
+        )
+        exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, num_samples=500)
 
         self.assertTrue(exp_1.as_map() != exp_2.as_map())
 
         # ---------------------------------[4]----------------------------------
-        discretizer = EntropyDiscretizer(X, [], feature_names, y,
-                                         random_state=20)
-        explainer_1 = LimeTabularExplainer(X,
-                                           feature_names=feature_names,
-                                           discretize_continuous=True,
-                                           discretizer=discretizer,
-                                           random_state=20)
-        exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
-                                             num_samples=500)
-
-        discretizer = EntropyDiscretizer(X, [], feature_names, y,
-                                         random_state=20)
-        explainer_2 = LimeTabularExplainer(X,
-                                           feature_names=feature_names,
-                                           discretize_continuous=True,
-                                           discretizer=discretizer,
-                                           random_state=20)
-        exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
-                                             num_samples=500)
+        discretizer = EntropyDiscretizer(X, [], feature_names, y, random_state=20)
+        explainer_1 = LimeTabularExplainer(
+            X,
+            feature_names=feature_names,
+            discretize_continuous=True,
+            discretizer=discretizer,
+            random_state=20,
+        )
+        exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba, num_samples=500)
+
+        discretizer = EntropyDiscretizer(X, [], feature_names, y, random_state=20)
+        explainer_2 = LimeTabularExplainer(
+            X,
+            feature_names=feature_names,
+            discretize_continuous=True,
+            discretizer=discretizer,
+            random_state=20,
+        )
+        exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba, num_samples=500)
 
         self.assertFalse(exp_1.as_map() != exp_2.as_map())
 
     def testFeatureNamesAndCategoricalFeats(self):
-        training_data = np.array([[0., 1.], [1., 0.]])
+        training_data = np.array([[0.0, 1.0], [1.0, 0.0]])
         explainer = LimeTabularExplainer(training_data=training_data)
-        self.assertEqual(explainer.feature_names, ['0', '1'])
+        self.assertEqual(explainer.feature_names, ["0", "1"])
         self.assertEqual(explainer.categorical_features, [0, 1])
 
         explainer = LimeTabularExplainer(
-            training_data=training_data,
-            feature_names=np.array(['one', 'two'])
+            training_data=training_data, feature_names=np.array(["one", "two"])
         )
-        self.assertEqual(explainer.feature_names, ['one', 'two'])
+        self.assertEqual(explainer.feature_names, ["one", "two"])
 
         explainer = LimeTabularExplainer(
             training_data=training_data,
             categorical_features=np.array([0]),
-            discretize_continuous=False
+            discretize_continuous=False,
         )
         self.assertEqual(explainer.categorical_features, [0])
 
     def testFeatureValues(self):
-        training_data = np.array([
-            [0, 0, 2],
-            [1, 1, 0],
-            [0, 2, 2],
-            [1, 3, 0]
-        ])
+        training_data = np.array([[0, 0, 2], [1, 1, 0], [0, 2, 2], [1, 3, 0]])
         explainer = LimeTabularExplainer(
-            training_data=training_data,
-            categorical_features=[0, 1, 2]
+            training_data=training_data, categorical_features=[0, 1, 2]
        )
         self.assertEqual(set(explainer.feature_values[0]), {0, 1})
         self.assertEqual(set(explainer.feature_values[1]), {0, 1, 2, 3})
         self.assertEqual(set(explainer.feature_values[2]), {0, 2})
-        assert_array_equal(explainer.feature_frequencies[0], np.array([.5, .5]))
-        assert_array_equal(explainer.feature_frequencies[1], np.array([.25, .25, .25, .25]))
-        assert_array_equal(explainer.feature_frequencies[2], np.array([.5, .5]))
+        assert_array_equal(explainer.feature_frequencies[0], np.array([0.5, 0.5]))
+        assert_array_equal(explainer.feature_frequencies[1], np.array([0.25, 0.25, 0.25, 0.25]))
+        assert_array_equal(explainer.feature_frequencies[2], np.array([0.5, 0.5]))
 
     def test_lime_explainer_with_data_stats(self):
         np.random.seed(1)
 
@@ -588,8 +576,9 @@ def test_lime_explainer_with_data_stats(self):
         i = np.random.randint(0, self.test.shape[0])
 
         # Generate stats using a quartile descritizer
-        descritizer = QuartileDiscretizer(self.train, [], self.feature_names, self.target_names,
-                                          random_state=20)
+        descritizer = QuartileDiscretizer(
+            self.train, [], self.feature_names, self.target_names, random_state=20
+        )
 
         d_means = descritizer.means
         d_stds = descritizer.stds
 
@@ -615,7 +604,7 @@ def test_lime_explainer_with_data_stats(self):
         index = 0
         for bin in d_bins:
             d_bins_revised[index] = bin.tolist()
-            index = index+1
+            index = index + 1
 
         # Descritized stats
         data_stats = {}
 
@@ -629,23 +618,55 @@ def test_lime_explainer_with_data_stats(self):
         data = np.zeros((2, len(self.feature_names)))
         explainer = LimeTabularExplainer(
-            data, feature_names=self.feature_names, random_state=10,
-            training_data_stats=data_stats, training_labels=self.target_names)
+            data,
+            feature_names=self.feature_names,
+            random_state=10,
+            training_data_stats=data_stats,
+            training_labels=self.target_names,
+        )
 
-        exp = explainer.explain_instance(self.test[i],
-                                         rf.predict_proba,
-                                         num_features=2,
-                                         model_regressor=LinearRegression())
+        exp = explainer.explain_instance(
+            self.test[i], rf.predict_proba, num_features=2, model_regressor=LinearRegression()
+        )
 
         self.assertIsNotNone(exp)
         keys = [x[0] for x in exp.as_list()]
-        self.assertEqual(1,
-                         sum([1 if 'petal width' in x else 0 for x in keys]),
-                         "Petal Width is a major feature")
-        self.assertEqual(1,
-                         sum([1 if 'petal length' in x else 0 for x in keys]),
-                         "Petal Length is a major feature")
+        self.assertEqual(
+            1, sum([1 if "petal width" in x else 0 for x in keys]), "Petal Width is a major feature"
+        )
+        self.assertEqual(
+            1,
+            sum([1 if "petal length" in x else 0 for x in keys]),
+            "Petal Length is a major feature",
+        )
+
+    def test_lime_explainer_parallel(self):
+        np.random.seed(1)
+
+        rf = RandomForestClassifier(n_estimators=500)
+        rf.fit(self.train, self.labels_train)
+        n_samples = min(self.test.shape[0], 20)
+        ind = np.random.choice(range(self.test.shape[0]), n_samples, replace=False)
+
+        explainer = LimeTabularExplainer(
+            self.train,
+            feature_names=self.feature_names,
+            class_names=self.target_names,
+            training_labels=self.labels_train,
+            discretize_continuous=True,
+            discretizer="entropy",
+        )
+
+        explainer_partial = partial(
+            explainer.explain_instance, predict_fn=rf.predict_proba, num_features=2
+        )
+        with mp.Pool(mp.cpu_count() - 1) as p:
+            exp_parallel = p.map(explainer_partial, self.test[ind])
+
+        self.assertIsNotNone(exp_parallel)
+        self.assertEqual(len(ind), len(exp_parallel))
+        self.assertTrue(all(isinstance(x, Explanation) for x in exp_parallel))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
    unittest.main()

From 812a1cbf04d510796cfe162d83b61a6e1c202f03 Mon Sep 17 00:00:00 2001
From: Sam Sharpe
Date: Tue, 12 Oct 2021 15:13:12 -0400
Subject: [PATCH 2/2] parallel example
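
The notebook added here trains a random forest on the Iris data and then
generates LIME explanations for a batch of test rows across a
multiprocessing pool. A minimal sketch of the pattern it demonstrates
(variable names follow the notebook: a fitted classifier `rf`, a
`LimeTabularExplainer` named `explainer`, and a `test` feature matrix):

    from functools import partial
    import multiprocessing as mp

    # Pool.map pickles the callable it hands to the workers, so the keyword
    # arguments are bound with functools.partial rather than a lambda.
    explainer_partial = partial(
        explainer.explain_instance, predict_fn=rf.predict_proba, num_features=2
    )
    with mp.Pool(mp.cpu_count() - 1) as p:
        exp_parallel = p.map(explainer_partial, test[:20])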
---
 ...al - parallel explanation generation.ipynb | 183 ++++++++++++++++++
 1 file changed, 183 insertions(+)
 create mode 100644 doc/notebooks/Tutorial - parallel explanation generation.ipynb

diff --git a/doc/notebooks/Tutorial - parallel explanation generation.ipynb b/doc/notebooks/Tutorial - parallel explanation generation.ipynb
new file mode 100644
index 00000000..14ec5c9a
--- /dev/null
+++ b/doc/notebooks/Tutorial - parallel explanation generation.ipynb
@@ -0,0 +1,183 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "source": [
+    "from functools import partial\n",
+    "\n",
+    "import sklearn\n",
+    "import sklearn.datasets\n",
+    "import sklearn.ensemble\n",
+    "import multiprocessing as mp\n",
+    "import numpy as np\n",
+    "import lime\n",
+    "import lime.lime_tabular\n",
+    "np.random.seed(1)"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Loading data, training a model"
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "For this part, we'll use the Iris dataset, and we'll train a random forest. "
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "source": [
+    "iris = sklearn.datasets.load_iris()"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "source": [
+    "train, test, labels_train, labels_test = sklearn.model_selection.train_test_split(iris.data, iris.target, train_size=0.80)\n"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "source": [
+    "rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500)\n",
+    "rf.fit(train, labels_train)\n"
+   ],
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "RandomForestClassifier(n_estimators=500)"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 4
+    }
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "source": [
+    "sklearn.metrics.accuracy_score(labels_test, rf.predict(test))"
+   ],
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "0.9666666666666667"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 5
+    }
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Create the explainer"
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "source": [
+    "explainer = lime.lime_tabular.LimeTabularExplainer(train, feature_names=iris.feature_names, class_names=iris.target_names, discretize_continuous=True)"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Explaining multiple instances in parallel"
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "source": [
+    "%%time\n",
+    "explainer_partial = partial(\n",
+    "    explainer.explain_instance, predict_fn=rf.predict_proba, num_features=2\n",
+    ")\n",
+    "with mp.Pool(mp.cpu_count() - 1) as p:\n",
+    "    exp_parallel = p.map(explainer_partial, test[:20])\n",
+    "\n",
+    "print(exp_parallel)"
+   ],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "[<lime.explanation.Explanation object at 0x...>, <lime.explanation.Explanation object at 0x...>, <lime.explanation.Explanation object at 0x...>, <lime.explanation.Explanation object at 0x...>, <lime.explanation.Explanation object at 0x...>, <lime.explanation.Explanation object at 0x...>, <lime.explanation.Explanation object at 0x...>, <lime.explanation.Explanation object at 0x...>, <lime.explanation.Explanation object at 0x...>, <lime.explanation.Explanation object at 0x...>, <lime.explanation.Explanation object at 0x...>, <lime.explanation.Explanation object at 0x...>, <lime.explanation.Explanation object at 0x...>, <lime.explanation.Explanation object at 0x...>, <lime.explanation.Explanation object at 0x...>, <lime.explanation.Explanation object at 0x...>, <lime.explanation.Explanation object at 0x...>, <lime.explanation.Explanation object at 0x...>, <lime.explanation.Explanation object at 0x...>, <lime.explanation.Explanation object at 0x...>]\n",
+      "CPU times: user 443 ms, sys: 77.7 ms, total: 521 ms\n",
+      "Wall time: 5.51 s\n"
+     ]
+    }
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "source": [
+    "exp_parallel[0].show_in_notebook(show_table=True, show_all=False)"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [],
+   "outputs": [],
+   "metadata": {}
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3.7.7 64-bit ('.venv': venv)"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.7"
+  },
+  "interpreter": {
"c5610125cf5e6a650be88d971b2a640487825e036e54247d3d2fb29abd9ffd91" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} \ No newline at end of file