
Commit

Support y_pred as pandas DataFrame for surrogate model predictions (#373)

* Support y_pred as pandas DataFrame for surrogate model predictions

Signed-off-by: Gaurav Gupta <[email protected]>

* Addressed code review comments

Signed-off-by: Gaurav Gupta <[email protected]>

* Regression fixes

Signed-off-by: Gaurav Gupta <[email protected]>

* Fix broken regression tests

Signed-off-by: Gaurav Gupta <[email protected]>

* Remove unneeded function

Signed-off-by: Gaurav Gupta <[email protected]>

* Fix flake8 errors

Signed-off-by: Gaurav Gupta <[email protected]>

* Fix regression case

Signed-off-by: Gaurav Gupta <[email protected]>
gaugup authored Feb 26, 2021
1 parent 670bd13 commit c0e89f5
Showing 4 changed files with 159 additions and 12 deletions.
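For context, a minimal sketch (not part of the commit) of the scenario this change supports: a surrogate-model explainer is handed a model whose predict method returns a pandas DataFrame rather than the 1-D numpy array the explainer expects.

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

class DataFramePredictor:
    """Hypothetical model whose predictions come back as a DataFrame."""
    def __init__(self, model):
        self.model = model

    def predict(self, X):
        # shape (n_samples, 1) DataFrame instead of shape (n_samples,) ndarray
        return pd.DataFrame(self.model.predict(X))

X, y = make_classification(random_state=0)
predictor = DataFramePredictor(LogisticRegression().fit(X, y))
print(type(predictor.predict(X)))  # <class 'pandas.core.frame.DataFrame'>

The wrapper changes below normalize such output back to numpy arrays.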
16 changes: 14 additions & 2 deletions python/interpret_community/common/model_wrapper.py
@@ -205,6 +205,8 @@ def predict(self, dataset):
         if is_sequential or isinstance(self._model, WrappedPytorchModel):
             return self._model.predict_classes(dataset).flatten()
         preds = self._model.predict(dataset)
+        if isinstance(preds, pd.DataFrame):
+            preds = preds.values.ravel()
         # Handle possible case where the model has only a predict function and it outputs probabilities
         # Note this is different from WrappedClassificationWithoutProbaModel where there is no predict_proba
         # method but the predict method outputs classes
@@ -222,7 +224,11 @@ def predict_proba(self, dataset):
         :param dataset: The dataset to predict_proba on.
         :type dataset: DatasetWrapper
         """
-        return self._eval_function(dataset)
+        proba_preds = self._eval_function(dataset)
+        if isinstance(proba_preds, pd.DataFrame):
+            proba_preds = proba_preds.values
+
+        return proba_preds


 class WrappedRegressionModel(object):
@@ -239,7 +245,11 @@ def predict(self, dataset):
         :param dataset: The dataset to predict on.
         :type dataset: DatasetWrapper
         """
-        return self._eval_function(dataset)
+        preds = self._eval_function(dataset)
+        if isinstance(preds, pd.DataFrame):
+            preds = preds.values.ravel()
+
+        return preds


 class WrappedClassificationWithoutProbaModel(object):
@@ -416,6 +426,8 @@ def _eval_function(function, examples, model_task, wrapped=False):
     # to force the user to disambiguate the results.
     if result.shape[1] == 1:
         if model_task == ModelTask.Unknown:
+            if isinstance(result, pd.DataFrame):
+                return (function, ModelTask.Regression)
             raise Exception("Please specify model_task to disambiguate model type since "
                             "result of calling function is 2D array of one column.")
         elif model_task == ModelTask.Classification:
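The DataFrame-to-ndarray conversions added in this file are mechanical; as an illustrative aside (not part of the commit), here is what they do to a single-column prediction DataFrame and a probability DataFrame:

import pandas as pd

preds = pd.DataFrame([0.1, 0.9, 0.4])          # shape (3, 1)
print(preds.values.ravel())                     # [0.1 0.9 0.4], 1-D ndarray

proba = pd.DataFrame([[0.2, 0.8], [0.7, 0.3]])
print(proba.values)                             # 2-D ndarray, shape (2, 2)

Predictions are flattened with ravel() because downstream code expects a 1-D label/value array, while predict_proba output stays 2-D (one column per class).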
36 changes: 29 additions & 7 deletions test/common_utils.py
@@ -16,13 +16,19 @@
 from lightgbm import LGBMClassifier, LGBMRegressor
 from xgboost import XGBClassifier

-from tensorflow import keras
-from tensorflow.keras.models import Sequential
-from tensorflow.keras.layers import Dense, Dropout, Activation
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
+try:
+    from tensorflow import keras
+    from tensorflow.keras.models import Sequential
+    from tensorflow.keras.layers import Dense, Dropout, Activation
+except ImportError:
+    pass
+
+try:
+    import torch
+    import torch.nn as nn
+    import torch.nn.functional as F
+except ImportError:
+    pass

 from pandas import read_csv

@@ -405,6 +411,22 @@ def create_scikit_cancer_data():
     return x_train, x_test, y_train, y_test, feature_names, classes


+def create_binary_classification_dataset():
+    from sklearn.datasets import make_classification
+    import pandas as pd
+    import numpy as np
+    X, y = make_classification()
+
+    # Split data into train and test
+    x_train, x_test, y_train, y_test = train_test_split(X,
+                                                        y,
+                                                        test_size=0.2,
+                                                        random_state=0)
+    classes = np.unique(y_train).tolist()
+
+    return pd.DataFrame(x_train), y_train, pd.DataFrame(x_test), y_test, classes
+
+
 def create_reviews_data(test_size):
     reviews_data = retrieve_dataset('reviews.json')
     papers = reviews_data['paper']
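The try/except guards above make tensorflow and torch optional test dependencies. A sketch of the companion pattern (assumed here, not shown in this diff) for skipping tests when a framework is absent:

import pytest

try:
    import torch  # noqa: F401
    torch_installed = True
except ImportError:
    torch_installed = False

@pytest.mark.skipif(not torch_installed, reason='torch is not installed')
def test_pytorch_model():
    ...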
41 changes: 41 additions & 0 deletions test/models.py
@@ -67,3 +67,44 @@ def predict_proba(self, X_pred):
         prediction = self.predict(X_pred).reshape(-1, 1)
         zeros = np.repeat(0, X_pred.shape[0]).reshape(-1, 1)
         return np.concatenate((1 - prediction, prediction, zeros), axis=1)
+
+
+class PredictAsDataFrameClassificationTestModel(object):
+    def __init__(self, model, return_predictions_as_dataframe=True):
+        self.return_predictions_as_dataframe = return_predictions_as_dataframe
+        self.model = model
+        pass
+
+    def fit(self, X, y):
+        pass
+
+    def predict(self, X_pred):
+        result = self.model.predict(X_pred)
+        if self.return_predictions_as_dataframe:
+            return pd.DataFrame(result)
+        else:
+            return result
+
+    def predict_proba(self, X_pred):
+        prediction = self.model.predict_proba(X_pred)
+        if self.return_predictions_as_dataframe:
+            return pd.DataFrame(prediction)
+        else:
+            return prediction
+
+
+class PredictAsDataFrameRegressionTestModel(object):
+    def __init__(self, model, return_predictions_as_dataframe=True):
+        self.return_predictions_as_dataframe = return_predictions_as_dataframe
+        self.model = model
+        pass
+
+    def fit(self, X, y):
+        pass
+
+    def predict(self, X_pred):
+        result = self.model.predict(X_pred)
+        if self.return_predictions_as_dataframe:
+            return pd.DataFrame(result)
+        else:
+            return result
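A quick usage sketch for these test models (assuming test/models.py is on the path):

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from models import PredictAsDataFrameClassificationTestModel

X, y = make_classification(random_state=0)
base = LogisticRegression().fit(X, y)
wrapped = PredictAsDataFrameClassificationTestModel(base)
assert isinstance(wrapped.predict(X), pd.DataFrame)
assert isinstance(wrapped.predict_proba(X), pd.DataFrame)

With return_predictions_as_dataframe=False the wrapper passes the underlying numpy output through unchanged, which is what lets the tests below parametrize over both formats.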
78 changes: 75 additions & 3 deletions test/test_mimic_explainer.py
@@ -21,10 +21,14 @@
 from interpret_community.common.exception import ScenarioNotSupportedException
 from interpret_community.common.constants import ShapValuesOutput, ModelTask
 from interpret_community.mimic.models.lightgbm_model import LGBMExplainableModel
-from interpret_community.mimic.models.linear_model import LinearExplainableModel
+from interpret_community.mimic.models.linear_model import LinearExplainableModel, \
+    SGDExplainableModel
 from interpret_community.mimic.models.tree_model import DecisionTreeExplainableModel
 from common_utils import create_timeseries_data, LIGHTGBM_METHOD, \
-    LINEAR_METHOD, create_lightgbm_regressor
-from models import DataFrameTestModel, SkewedTestModel
+    LINEAR_METHOD, create_lightgbm_regressor, create_binary_classification_dataset, \
+    create_iris_data
+from models import DataFrameTestModel, SkewedTestModel, \
+    PredictAsDataFrameClassificationTestModel, PredictAsDataFrameRegressionTestModel
 from datasets import retrieve_dataset
 from sklearn import datasets
 import uuid
@@ -599,3 +603,71 @@ def iris_per_class_expected_features_special_args(self):
                 [['petal length', 'petal width', 'sepal width', 'sepal length'],
                  ['petal length', 'petal width', 'sepal width', 'sepal length'],
                  ['petal length', 'petal width', 'sepal width', 'sepal length']]]
+
+
+@pytest.mark.owner(email=owner_email_tools_and_ux)
+@pytest.mark.usefixtures('clean_dir')
+class TestMimicExplainerWrappedModels(object):
+    def test_working(self):
+        assert True
+
+    @pytest.mark.parametrize('if_predictions_as_dataframe', [True, False])
+    @pytest.mark.parametrize('explainable_model', [LGBMExplainableModel,
+                                                   LinearExplainableModel,
+                                                   DecisionTreeExplainableModel,
+                                                   SGDExplainableModel])
+    def test_explain_model_binary_classification_with_different_format_predictions(
+            self, mimic_explainer, if_predictions_as_dataframe, explainable_model):
+        x_train, y_train, x_test, y_test, classes = create_binary_classification_dataset()
+        model = LogisticRegression(random_state=42).fit(x_train, y_train)
+        model.fit(x_train, y_train)
+
+        model = PredictAsDataFrameClassificationTestModel(
+            model, return_predictions_as_dataframe=if_predictions_as_dataframe)
+        kwargs = {}
+        explainer = mimic_explainer(model, x_train, explainable_model, **kwargs)
+        global_explanation = explainer.explain_global(evaluation_examples=x_test)
+        assert global_explanation is not None
+
+    @pytest.mark.parametrize('if_predictions_as_dataframe', [True, False])
+    @pytest.mark.parametrize('explainable_model', [LGBMExplainableModel,
+                                                   LinearExplainableModel,
+                                                   DecisionTreeExplainableModel,
+                                                   SGDExplainableModel])
+    def test_explain_model_multiclass_classification_with_different_format_predictions(
+            self, mimic_explainer, if_predictions_as_dataframe, explainable_model):
+        x_train, x_test, y_train, y_test, _, classes = create_iris_data()
+        model = LogisticRegression(random_state=42).fit(x_train, y_train)
+        model.fit(x_train, y_train)
+
+        model = PredictAsDataFrameClassificationTestModel(
+            model, return_predictions_as_dataframe=if_predictions_as_dataframe)
+
+        kwargs = {}
+        explainer = mimic_explainer(model, x_train, explainable_model, **kwargs)
+        global_explanation = explainer.explain_global(evaluation_examples=x_test)
+        assert global_explanation is not None
+
+    @pytest.mark.parametrize('if_predictions_as_dataframe', [True, False])
+    @pytest.mark.parametrize('explainable_model', [LGBMExplainableModel,
+                                                   LinearExplainableModel,
+                                                   DecisionTreeExplainableModel,
+                                                   SGDExplainableModel])
+    def test_explain_model_regression_with_different_format_predictions(
+            self, mimic_explainer, if_predictions_as_dataframe, explainable_model):
+        num_features = 3
+        x_train = np.array([['a', 'E', 'x'], ['c', 'D', 'y']])
+        y_train = np.array([1, 2])
+        lin = LinearRegression(normalize=True)
+        one_hot_transformer = Pipeline(steps=[('one-hot', OneHotEncoder())])
+        transformations = [(list(range(num_features)), one_hot_transformer)]
+        clf = Pipeline(steps=[('preprocessor', one_hot_transformer), ('regressor', lin)])
+        model = clf.fit(x_train, y_train)
+        model = PredictAsDataFrameRegressionTestModel(model.named_steps['regressor'],
+                                                      if_predictions_as_dataframe)
+        explainable_model = explainable_model
+        explainer = mimic_explainer(model, x_train, explainable_model,
+                                    transformations=transformations, augment_data=False,
+                                    explainable_model_args={}, features=['f1', 'f2', 'f3'])
+        global_explanation = explainer.explain_global(x_train)
+        assert global_explanation is not None
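As a usage note (not part of the commit), the new parametrized tests can be run selectively with pytest's keyword filter:

pytest test/test_mimic_explainer.py -k "different_format_predictions"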
