diff --git a/python/interpret_community/common/model_wrapper.py b/python/interpret_community/common/model_wrapper.py
index 4b6207db..09058459 100644
--- a/python/interpret_community/common/model_wrapper.py
+++ b/python/interpret_community/common/model_wrapper.py
@@ -205,6 +205,8 @@ def predict(self, dataset):
         if is_sequential or isinstance(self._model, WrappedPytorchModel):
             return self._model.predict_classes(dataset).flatten()
         preds = self._model.predict(dataset)
+        if isinstance(preds, pd.DataFrame):
+            preds = preds.values.ravel()
         # Handle possible case where the model has only a predict function and it outputs probabilities
         # Note this is different from WrappedClassificationWithoutProbaModel where there is no predict_proba
         # method but the predict method outputs classes
@@ -222,7 +224,11 @@ def predict_proba(self, dataset):
         :param dataset: The dataset to predict_proba on.
         :type dataset: DatasetWrapper
         """
-        return self._eval_function(dataset)
+        proba_preds = self._eval_function(dataset)
+        if isinstance(proba_preds, pd.DataFrame):
+            proba_preds = proba_preds.values
+
+        return proba_preds
 
 
 class WrappedRegressionModel(object):
@@ -239,7 +245,11 @@ def predict(self, dataset):
         :param dataset: The dataset to predict on.
         :type dataset: DatasetWrapper
         """
-        return self._eval_function(dataset)
+        preds = self._eval_function(dataset)
+        if isinstance(preds, pd.DataFrame):
+            preds = preds.values.ravel()
+
+        return preds
 
 
 class WrappedClassificationWithoutProbaModel(object):
@@ -416,6 +426,8 @@ def _eval_function(function, examples, model_task, wrapped=False):
         # to force the user to disambiguate the results.
         if result.shape[1] == 1:
             if model_task == ModelTask.Unknown:
+                if isinstance(result, pd.DataFrame):
+                    return (function, ModelTask.Regression)
                 raise Exception("Please specify model_task to disambiguate model type since "
                                 "result of calling function is 2D array of one column.")
             elif model_task == ModelTask.Classification:
diff --git a/test/common_utils.py b/test/common_utils.py
index 73d2fa52..6408a01c 100644
--- a/test/common_utils.py
+++ b/test/common_utils.py
@@ -16,13 +16,19 @@
 from lightgbm import LGBMClassifier, LGBMRegressor
 from xgboost import XGBClassifier
 
-from tensorflow import keras
-from tensorflow.keras.models import Sequential
-from tensorflow.keras.layers import Dense, Dropout, Activation
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
+try:
+    from tensorflow import keras
+    from tensorflow.keras.models import Sequential
+    from tensorflow.keras.layers import Dense, Dropout, Activation
+except ImportError:
+    pass
+
+try:
+    import torch
+    import torch.nn as nn
+    import torch.nn.functional as F
+except ImportError:
+    pass
 
 from pandas import read_csv
 
@@ -405,6 +411,22 @@ def create_scikit_cancer_data():
     return x_train, x_test, y_train, y_test, feature_names, classes
 
 
+def create_binary_classification_dataset():
+    from sklearn.datasets import make_classification
+    import pandas as pd
+    import numpy as np
+    X, y = make_classification()
+
+    # Split data into train and test
+    x_train, x_test, y_train, y_test = train_test_split(X,
+                                                        y,
+                                                        test_size=0.2,
+                                                        random_state=0)
+    classes = np.unique(y_train).tolist()
+
+    return pd.DataFrame(x_train), y_train, pd.DataFrame(x_test), y_test, classes
+
+
 def create_reviews_data(test_size):
     reviews_data = retrieve_dataset('reviews.json')
     papers = reviews_data['paper']
diff --git a/test/models.py b/test/models.py
index 954b0af6..5b3396e8 100644
--- a/test/models.py
+++ b/test/models.py
@@ -67,3 +67,42 @@ def predict_proba(self, X_pred):
         prediction = self.predict(X_pred).reshape(-1, 1)
         zeros = np.repeat(0, X_pred.shape[0]).reshape(-1, 1)
         return np.concatenate((1 - prediction, prediction, zeros), axis=1)
+
+
+class PredictAsDataFrameClassificationTestModel(object):
+    def __init__(self, model, return_predictions_as_dataframe=True):
+        self.return_predictions_as_dataframe = return_predictions_as_dataframe
+        self.model = model
+
+    def fit(self, X, y):
+        pass
+
+    def predict(self, X_pred):
+        result = self.model.predict(X_pred)
+        if self.return_predictions_as_dataframe:
+            return pd.DataFrame(result)
+        else:
+            return result
+
+    def predict_proba(self, X_pred):
+        prediction = self.model.predict_proba(X_pred)
+        if self.return_predictions_as_dataframe:
+            return pd.DataFrame(prediction)
+        else:
+            return prediction
+
+
+class PredictAsDataFrameRegressionTestModel(object):
+    def __init__(self, model, return_predictions_as_dataframe=True):
+        self.return_predictions_as_dataframe = return_predictions_as_dataframe
+        self.model = model
+
+    def fit(self, X, y):
+        pass
+
+    def predict(self, X_pred):
+        result = self.model.predict(X_pred)
+        if self.return_predictions_as_dataframe:
+            return pd.DataFrame(result)
+        else:
+            return result
diff --git a/test/test_mimic_explainer.py b/test/test_mimic_explainer.py
index 0684396f..9e5b0307 100644
--- a/test/test_mimic_explainer.py
+++ b/test/test_mimic_explainer.py
@@ -21,10 +21,14 @@
 from interpret_community.common.exception import ScenarioNotSupportedException
 from interpret_community.common.constants import ShapValuesOutput, ModelTask
 from interpret_community.mimic.models.lightgbm_model import LGBMExplainableModel
-from interpret_community.mimic.models.linear_model import LinearExplainableModel
+from interpret_community.mimic.models.linear_model import LinearExplainableModel, \
+    SGDExplainableModel
+from interpret_community.mimic.models.tree_model import DecisionTreeExplainableModel
 from common_utils import create_timeseries_data, LIGHTGBM_METHOD, \
-    LINEAR_METHOD, create_lightgbm_regressor
-from models import DataFrameTestModel, SkewedTestModel
+    LINEAR_METHOD, create_lightgbm_regressor, create_binary_classification_dataset, \
+    create_iris_data
+from models import DataFrameTestModel, SkewedTestModel, \
+    PredictAsDataFrameClassificationTestModel, PredictAsDataFrameRegressionTestModel
 from datasets import retrieve_dataset
 from sklearn import datasets
 import uuid
@@ -599,3 +603,68 @@ def iris_per_class_expected_features_special_args(self):
                 [['petal length', 'petal width', 'sepal width', 'sepal length'],
                  ['petal length', 'petal width', 'sepal width', 'sepal length'],
                  ['petal length', 'petal width', 'sepal width', 'sepal length']]]
+
+
+@pytest.mark.owner(email=owner_email_tools_and_ux)
+@pytest.mark.usefixtures('clean_dir')
+class TestMimicExplainerWrappedModels(object):
+    def test_working(self):
+        assert True
+
+    @pytest.mark.parametrize('if_predictions_as_dataframe', [True, False])
+    @pytest.mark.parametrize('explainable_model', [LGBMExplainableModel,
+                                                   LinearExplainableModel,
+                                                   DecisionTreeExplainableModel,
+                                                   SGDExplainableModel])
+    def test_explain_model_binary_classification_with_different_format_predictions(
+            self, mimic_explainer, if_predictions_as_dataframe, explainable_model):
+        x_train, y_train, x_test, y_test, classes = create_binary_classification_dataset()
+        model = LogisticRegression(random_state=42).fit(x_train, y_train)
+
+        model = PredictAsDataFrameClassificationTestModel(
+            model, return_predictions_as_dataframe=if_predictions_as_dataframe)
+        kwargs = {}
+        explainer = mimic_explainer(model, x_train, explainable_model, **kwargs)
+        global_explanation = explainer.explain_global(evaluation_examples=x_test)
+        assert global_explanation is not None
+
+    @pytest.mark.parametrize('if_predictions_as_dataframe', [True, False])
+    @pytest.mark.parametrize('explainable_model', [LGBMExplainableModel,
+                                                   LinearExplainableModel,
+                                                   DecisionTreeExplainableModel,
+                                                   SGDExplainableModel])
+    def test_explain_model_multiclass_classification_with_different_format_predictions(
+            self, mimic_explainer, if_predictions_as_dataframe, explainable_model):
+        x_train, x_test, y_train, y_test, _, classes = create_iris_data()
+        model = LogisticRegression(random_state=42).fit(x_train, y_train)
+
+        model = PredictAsDataFrameClassificationTestModel(
+            model, return_predictions_as_dataframe=if_predictions_as_dataframe)
+
+        kwargs = {}
+        explainer = mimic_explainer(model, x_train, explainable_model, **kwargs)
+        global_explanation = explainer.explain_global(evaluation_examples=x_test)
+        assert global_explanation is not None
+
+    @pytest.mark.parametrize('if_predictions_as_dataframe', [True, False])
+    @pytest.mark.parametrize('explainable_model', [LGBMExplainableModel,
+                                                   LinearExplainableModel,
+                                                   DecisionTreeExplainableModel,
+                                                   SGDExplainableModel])
+    def test_explain_model_regression_with_different_format_predictions(
+            self, mimic_explainer, if_predictions_as_dataframe, explainable_model):
+        num_features = 3
+        x_train = np.array([['a', 'E', 'x'], ['c', 'D', 'y']])
+        y_train = np.array([1, 2])
+        lin = LinearRegression(normalize=True)
+        one_hot_transformer = Pipeline(steps=[('one-hot', OneHotEncoder())])
+        transformations = [(list(range(num_features)), one_hot_transformer)]
+        clf = Pipeline(steps=[('preprocessor', one_hot_transformer), ('regressor', lin)])
+        model = clf.fit(x_train, y_train)
+        model = PredictAsDataFrameRegressionTestModel(model.named_steps['regressor'],
+                                                      if_predictions_as_dataframe)
+        explainer = mimic_explainer(model, x_train, explainable_model,
+                                    transformations=transformations, augment_data=False,
+                                    explainable_model_args={}, features=['f1', 'f2', 'f3'])
+        global_explanation = explainer.explain_global(x_train)
+        assert global_explanation is not None