
Commit

Support y_pred as pandas DataFrame for surrogate model predictions (#373)

* Support y_pred as pandas DataFrame for surrogate model predictions

Signed-off-by: Gaurav Gupta <[email protected]>

* Addressed code review comments

Signed-off-by: Gaurav Gupta <[email protected]>

* Regression fixes

Signed-off-by: Gaurav Gupta <[email protected]>

* Fix broken regression tests

Signed-off-by: Gaurav Gupta <[email protected]>

* Remove unneeded function

Signed-off-by: Gaurav Gupta <[email protected]>

* Fix flake8 errors

Signed-off-by: Gaurav Gupta <[email protected]>

* Fix regression case

Signed-off-by: Gaurav Gupta <[email protected]>
gaugup authored Feb 26, 2021
1 parent 670bd13 commit c0e89f5
Showing 4 changed files with 159 additions and 12 deletions.
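For context, a minimal sketch (not part of the commit) of the scenario this change supports: a surrogate-model explainer is handed a model whose predict method returns a pandas DataFrame rather than the 1-D numpy array the explainer expects.

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

class DataFramePredictor:
    """Hypothetical model whose predictions come back as a DataFrame."""
    def __init__(self, model):
        self.model = model

    def predict(self, X):
        # shape (n_samples, 1) DataFrame instead of shape (n_samples,) ndarray
        return pd.DataFrame(self.model.predict(X))

X, y = make_classification(random_state=0)
predictor = DataFramePredictor(LogisticRegression().fit(X, y))
print(type(predictor.predict(X)))  # <class 'pandas.core.frame.DataFrame'>

The wrapper changes below normalize such output back to numpy arrays.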
16 changes: 14 additions & 2 deletions python/interpret_community/common/model_wrapper.py
@@ -205,6 +205,8 @@ def predict(self, dataset):
         if is_sequential or isinstance(self._model, WrappedPytorchModel):
             return self._model.predict_classes(dataset).flatten()
         preds = self._model.predict(dataset)
+        if isinstance(preds, pd.DataFrame):
+            preds = preds.values.ravel()
         # Handle possible case where the model has only a predict function and it outputs probabilities
         # Note this is different from WrappedClassificationWithoutProbaModel where there is no predict_proba
         # method but the predict method outputs classes
@@ -222,7 +224,11 @@ def predict_proba(self, dataset):
         :param dataset: The dataset to predict_proba on.
         :type dataset: DatasetWrapper
         """
-        return self._eval_function(dataset)
+        proba_preds = self._eval_function(dataset)
+        if isinstance(proba_preds, pd.DataFrame):
+            proba_preds = proba_preds.values
+
+        return proba_preds


 class WrappedRegressionModel(object):
@@ -239,7 +245,11 @@ def predict(self, dataset):
         :param dataset: The dataset to predict on.
         :type dataset: DatasetWrapper
         """
-        return self._eval_function(dataset)
+        preds = self._eval_function(dataset)
+        if isinstance(preds, pd.DataFrame):
+            preds = preds.values.ravel()
+
+        return preds


 class WrappedClassificationWithoutProbaModel(object):
@@ -416,6 +426,8 @@ def _eval_function(function, examples, model_task, wrapped=False):
     # to force the user to disambiguate the results.
     if result.shape[1] == 1:
         if model_task == ModelTask.Unknown:
+            if isinstance(result, pd.DataFrame):
+                return (function, ModelTask.Regression)
             raise Exception("Please specify model_task to disambiguate model type since "
                             "result of calling function is 2D array of one column.")
         elif model_task == ModelTask.Classification:
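The DataFrame-to-ndarray conversions added in this file are mechanical; as an illustrative aside (not part of the commit), here is what they do to a single-column prediction DataFrame and a probability DataFrame:

import pandas as pd

preds = pd.DataFrame([0.1, 0.9, 0.4])          # shape (3, 1)
print(preds.values.ravel())                     # [0.1 0.9 0.4], 1-D ndarray

proba = pd.DataFrame([[0.2, 0.8], [0.7, 0.3]])
print(proba.values)                             # 2-D ndarray, shape (2, 2)

Predictions are flattened with ravel() because downstream code expects a 1-D label/value array, while predict_proba output stays 2-D (one column per class).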
36 changes: 29 additions & 7 deletions test/common_utils.py
@@ -16,13 +16,19 @@
 from lightgbm import LGBMClassifier, LGBMRegressor
 from xgboost import XGBClassifier

-from tensorflow import keras
-from tensorflow.keras.models import Sequential
-from tensorflow.keras.layers import Dense, Dropout, Activation
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
+try:
+    from tensorflow import keras
+    from tensorflow.keras.models import Sequential
+    from tensorflow.keras.layers import Dense, Dropout, Activation
+except ImportError:
+    pass
+
+try:
+    import torch
+    import torch.nn as nn
+    import torch.nn.functional as F
+except ImportError:
+    pass

 from pandas import read_csv

@@ -405,6 +411,22 @@ def create_scikit_cancer_data():
     return x_train, x_test, y_train, y_test, feature_names, classes


+def create_binary_classification_dataset():
+    from sklearn.datasets import make_classification
+    import pandas as pd
+    import numpy as np
+    X, y = make_classification()
+
+    # Split data into train and test
+    x_train, x_test, y_train, y_test = train_test_split(X,
+                                                        y,
+                                                        test_size=0.2,
+                                                        random_state=0)
+    classes = np.unique(y_train).tolist()
+
+    return pd.DataFrame(x_train), y_train, pd.DataFrame(x_test), y_test, classes
+
+
 def create_reviews_data(test_size):
     reviews_data = retrieve_dataset('reviews.json')
     papers = reviews_data['paper']
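The try/except guards above make tensorflow and torch optional test dependencies. A sketch of the companion pattern (assumed here, not shown in this diff) for skipping tests when a framework is absent:

import pytest

try:
    import torch  # noqa: F401
    torch_installed = True
except ImportError:
    torch_installed = False

@pytest.mark.skipif(not torch_installed, reason='torch is not installed')
def test_pytorch_model():
    ...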
41 changes: 41 additions & 0 deletions test/models.py
@@ -67,3 +67,44 @@ def predict_proba(self, X_pred):
         prediction = self.predict(X_pred).reshape(-1, 1)
         zeros = np.repeat(0, X_pred.shape[0]).reshape(-1, 1)
         return np.concatenate((1 - prediction, prediction, zeros), axis=1)
+
+
+class PredictAsDataFrameClassificationTestModel(object):
+    def __init__(self, model, return_predictions_as_dataframe=True):
+        self.return_predictions_as_dataframe = return_predictions_as_dataframe
+        self.model = model
+        pass
+
+    def fit(self, X, y):
+        pass
+
+    def predict(self, X_pred):
+        result = self.model.predict(X_pred)
+        if self.return_predictions_as_dataframe:
+            return pd.DataFrame(result)
+        else:
+            return result
+
+    def predict_proba(self, X_pred):
+        prediction = self.model.predict_proba(X_pred)
+        if self.return_predictions_as_dataframe:
+            return pd.DataFrame(prediction)
+        else:
+            return prediction
+
+
+class PredictAsDataFrameRegressionTestModel(object):
+    def __init__(self, model, return_predictions_as_dataframe=True):
+        self.return_predictions_as_dataframe = return_predictions_as_dataframe
+        self.model = model
+        pass
+
+    def fit(self, X, y):
+        pass
+
+    def predict(self, X_pred):
+        result = self.model.predict(X_pred)
+        if self.return_predictions_as_dataframe:
+            return pd.DataFrame(result)
+        else:
+            return result
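A quick usage sketch for these test models (assuming test/models.py is on the path):

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from models import PredictAsDataFrameClassificationTestModel

X, y = make_classification(random_state=0)
base = LogisticRegression().fit(X, y)
wrapped = PredictAsDataFrameClassificationTestModel(base)
assert isinstance(wrapped.predict(X), pd.DataFrame)
assert isinstance(wrapped.predict_proba(X), pd.DataFrame)

With return_predictions_as_dataframe=False the wrapper passes the underlying numpy output through unchanged, which is what lets the tests below parametrize over both formats.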
78 changes: 75 additions & 3 deletions test/test_mimic_explainer.py
@@ -21,10 +21,14 @@
 from interpret_community.common.exception import ScenarioNotSupportedException
 from interpret_community.common.constants import ShapValuesOutput, ModelTask
 from interpret_community.mimic.models.lightgbm_model import LGBMExplainableModel
-from interpret_community.mimic.models.linear_model import LinearExplainableModel
+from interpret_community.mimic.models.linear_model import LinearExplainableModel, \
+    SGDExplainableModel
 from interpret_community.mimic.models.tree_model import DecisionTreeExplainableModel
 from common_utils import create_timeseries_data, LIGHTGBM_METHOD, \
-    LINEAR_METHOD, create_lightgbm_regressor
-from models import DataFrameTestModel, SkewedTestModel
+    LINEAR_METHOD, create_lightgbm_regressor, create_binary_classification_dataset, \
+    create_iris_data
+from models import DataFrameTestModel, SkewedTestModel, \
+    PredictAsDataFrameClassificationTestModel, PredictAsDataFrameRegressionTestModel
 from datasets import retrieve_dataset
 from sklearn import datasets
 import uuid
@@ -599,3 +603,71 @@ def iris_per_class_expected_features_special_args(self):
                 [['petal length', 'petal width', 'sepal width', 'sepal length'],
                  ['petal length', 'petal width', 'sepal width', 'sepal length'],
                  ['petal length', 'petal width', 'sepal width', 'sepal length']]]
+
+
+@pytest.mark.owner(email=owner_email_tools_and_ux)
+@pytest.mark.usefixtures('clean_dir')
+class TestMimicExplainerWrappedModels(object):
+    def test_working(self):
+        assert True
+
+    @pytest.mark.parametrize('if_predictions_as_dataframe', [True, False])
+    @pytest.mark.parametrize('explainable_model', [LGBMExplainableModel,
+                                                   LinearExplainableModel,
+                                                   DecisionTreeExplainableModel,
+                                                   SGDExplainableModel])
+    def test_explain_model_binary_classification_with_different_format_predictions(
+            self, mimic_explainer, if_predictions_as_dataframe, explainable_model):
+        x_train, y_train, x_test, y_test, classes = create_binary_classification_dataset()
+        model = LogisticRegression(random_state=42).fit(x_train, y_train)
+        model.fit(x_train, y_train)
+
+        model = PredictAsDataFrameClassificationTestModel(
+            model, return_predictions_as_dataframe=if_predictions_as_dataframe)
+        kwargs = {}
+        explainer = mimic_explainer(model, x_train, explainable_model, **kwargs)
+        global_explanation = explainer.explain_global(evaluation_examples=x_test)
+        assert global_explanation is not None
+
+    @pytest.mark.parametrize('if_predictions_as_dataframe', [True, False])
+    @pytest.mark.parametrize('explainable_model', [LGBMExplainableModel,
+                                                   LinearExplainableModel,
+                                                   DecisionTreeExplainableModel,
+                                                   SGDExplainableModel])
+    def test_explain_model_multiclass_classification_with_different_format_predictions(
+            self, mimic_explainer, if_predictions_as_dataframe, explainable_model):
+        x_train, x_test, y_train, y_test, _, classes = create_iris_data()
+        model = LogisticRegression(random_state=42).fit(x_train, y_train)
+        model.fit(x_train, y_train)
+
+        model = PredictAsDataFrameClassificationTestModel(
+            model, return_predictions_as_dataframe=if_predictions_as_dataframe)
+
+        kwargs = {}
+        explainer = mimic_explainer(model, x_train, explainable_model, **kwargs)
+        global_explanation = explainer.explain_global(evaluation_examples=x_test)
+        assert global_explanation is not None
+
+    @pytest.mark.parametrize('if_predictions_as_dataframe', [True, False])
+    @pytest.mark.parametrize('explainable_model', [LGBMExplainableModel,
+                                                   LinearExplainableModel,
+                                                   DecisionTreeExplainableModel,
+                                                   SGDExplainableModel])
+    def test_explain_model_regression_with_different_format_predictions(
+            self, mimic_explainer, if_predictions_as_dataframe, explainable_model):
+        num_features = 3
+        x_train = np.array([['a', 'E', 'x'], ['c', 'D', 'y']])
+        y_train = np.array([1, 2])
+        lin = LinearRegression(normalize=True)
+        one_hot_transformer = Pipeline(steps=[('one-hot', OneHotEncoder())])
+        transformations = [(list(range(num_features)), one_hot_transformer)]
+        clf = Pipeline(steps=[('preprocessor', one_hot_transformer), ('regressor', lin)])
+        model = clf.fit(x_train, y_train)
+        model = PredictAsDataFrameRegressionTestModel(model.named_steps['regressor'],
+                                                      if_predictions_as_dataframe)
+        explainable_model = explainable_model
+        explainer = mimic_explainer(model, x_train, explainable_model,
+                                    transformations=transformations, augment_data=False,
+                                    explainable_model_args={}, features=['f1', 'f2', 'f3'])
+        global_explanation = explainer.explain_global(x_train)
+        assert global_explanation is not None
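As a usage note (not part of the commit), the new parametrized tests can be run selectively with pytest's keyword filter:

pytest test/test_mimic_explainer.py -k "different_format_predictions"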
