Skip to content

Commit

Permalink
Consider binary case as a subcase of categorical
Browse files Browse the repository at this point in the history
  • Loading branch information
jpaillard committed Feb 20, 2025
1 parent c325bcc commit 302f5c0
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 39 deletions.
2 changes: 1 addition & 1 deletion examples/plot_diabetes_variable_importance_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@
cpi = CPI(
estimator=regressor_list[i],
imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 10)),
imputation_model_binary=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)),
imputation_model_categorical=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)),
# covariate_estimator=HistGradientBoostingRegressor(random_state=0,),
n_permutations=50,
random_state=0,
Expand Down
13 changes: 2 additions & 11 deletions src/hidimstat/conditional_permutation_importance.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ def __init__(
n_jobs: int = 1,
n_permutations: int = 50,
imputation_model_continuous=None,
imputation_model_binary=None,
imputation_model_categorical=None,
random_state: int = None,
categorical_max_cardinality: int = 10,
Expand Down Expand Up @@ -46,12 +45,10 @@ def __init__(
imputation_model_continuous : sklearn compatible estimator, optional
The model used to estimate the conditional distribution of a given
continuous variable/group of variables given the others.
imputation_model_binary : sklearn compatible estimator, optional
The model used to estimate the conditional distribution of a given
binary variable/group of variables given the others.
imputation_model_categorical : sklearn compatible estimator, optional
The model used to estimate the conditional distribution of a given
categorical variable/group of variables given the others.
categorical variable/group of variables given the others. Binary is
considered a special case of categorical.
random_state : int, default=None
The random state to use for sampling.
categorical_max_cardinality : int, default=10
Expand All @@ -71,7 +68,6 @@ def __init__(

self.imputation_model = {
"continuous": imputation_model_continuous,
"binary": imputation_model_binary,
"categorical": imputation_model_categorical,
}
self.categorical_max_cardinality = categorical_max_cardinality
Expand All @@ -92,11 +88,6 @@ def fit(self, X, y=None, groups=None, var_type="auto"):
if self.imputation_model["continuous"] is None
else clone(self.imputation_model["continuous"])
),
model_binary=(
None
if self.imputation_model["binary"] is None
else clone(self.imputation_model["binary"])
),
model_categorical=(
None
if self.imputation_model["categorical"] is None
Expand Down
27 changes: 9 additions & 18 deletions src/hidimstat/conditional_sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ class ConditionalSampler:
def __init__(
self,
model_regression=None,
model_binary=None,
model_categorical=None,
data_type: str = "auto",
random_state=None,
Expand All @@ -21,15 +20,14 @@ def __init__(
----------
model_regression : sklearn compatible estimator, optional
The model to use for continuous data.
model_binary : sklearn compatible estimator, optional
The model to use for binary data.
model_categorical : sklearn compatible estimator, optional
The model to use for categorical data.
The model to use for categorical data. Binary is considered a special
case of categorical data.
data_type : str, default="auto"
The variable type. Supported types include "auto", "continuous", "binary",
and "categorical". If "auto", the type is inferred from the cardinality of
the unique values passed to the `fit` method. For categorical variables, the
default strategy is to use a one-vs-rest classifier.
The variable type. Supported types include "auto", "continuous", and
"categorical". If "auto", the type is inferred from the cardinality
of the unique values passed to the `fit` method. For categorical variables,
the default strategy is to use a one-vs-rest classifier.
random_state : int, optional
The random state to use for sampling.
categorical_max_cardinality : int, default=10
Expand All @@ -39,19 +37,15 @@ def __init__(
"""
self.data_type = data_type
self.model_regression = model_regression
self.model_binary = model_binary
self.model_categorical = model_categorical

if data_type == "auto":
self.model_auto = {
"continuous": model_regression,
"binary": model_binary,
"categorical": model_categorical,
}
elif data_type == "continuous":
self.model = model_regression
elif data_type == "binary":
self.model = model_binary
elif data_type == "categorical":
self.model = model_categorical
else:
Expand All @@ -72,10 +66,7 @@ def fit(self, X: np.ndarray, y: np.ndarray):
"""

if self.data_type == "auto":
if len(np.unique(y)) == 2:
self.data_type = "binary"
self.model = self.model_auto["binary"]
elif len(np.unique(y)) <= self.categorical_max_cardinality:
if len(np.unique(y)) <= self.categorical_max_cardinality:
self.data_type = "categorical"
self.model = self.model_auto["categorical"]
else:
Expand All @@ -84,7 +75,7 @@ def fit(self, X: np.ndarray, y: np.ndarray):

# Group of variables
if (y.ndim > 1) and (y.shape[1] > 1):
if self.data_type in ["binary", "categorical"]:
if self.data_type == "categorical":
self.model = MultiOutputClassifier(self.model)
elif self.data_type == "continuous" and not issubclass(
self.model.__class__, MultiOutputMixin
Expand Down Expand Up @@ -130,7 +121,7 @@ def sample(self, X: np.ndarray, y: np.ndarray, n_samples: int = 1) -> np.ndarray
)
return y_hat[np.newaxis, ...] + residual_permuted

elif self.data_type in ["binary", "categorical"]:
elif self.data_type == "categorical":
if not hasattr(self.model, "predict_proba"):
raise AttributeError(
"The model must have a `predict_proba` method to be used for \
Expand Down
15 changes: 7 additions & 8 deletions test/test_conditional_sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ def test_binary_case():
np.random.seed(40)

sampler = ConditionalSampler(
model_binary=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)),
data_type="binary",
model_categorical=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)),
data_type="categorical",
random_state=0,
)

Expand All @@ -78,9 +78,8 @@ def test_binary_case():
assert accuracy_score(X_1_perm[i], X[:, 1]) < 0.6

sampler = ConditionalSampler(
model_binary=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)),
model_regression=RidgeCV(alphas=np.logspace(-2, 2, 10)),
model_categorical=None,
model_categorical=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)),
data_type="auto",
random_state=0,
)
Expand All @@ -99,7 +98,7 @@ def test_error():
np.random.seed(40)
sampler = ConditionalSampler(
model_regression=RidgeClassifier(),
data_type="binary",
data_type="categorical",
random_state=0,
)
X = np.random.randint(0, 2, size=(100, 2))
Expand All @@ -114,7 +113,7 @@ def test_error():
)
with pytest.raises(AttributeError):
sampler.fit(np.delete(X, 1, axis=1), X[:, 1])
sampler.sample()
sampler.sample(np.delete(X, 1, axis=1), X[:, 1])

sampler = ConditionalSampler(
data_type="auto",
Expand Down Expand Up @@ -161,8 +160,8 @@ def test_group_case():
X[:, 4] = 2 * X[:, 1] - 1 + np.random.randn(X.shape[0]) * 0.3 > 0
model = LogisticRegressionCV(Cs=np.logspace(-2, 2, 10))
sampler = ConditionalSampler(
model_binary=model,
data_type="binary",
model_categorical=model,
data_type="categorical",
random_state=0,
)
sampler.fit(X[:, :3], X[:, 3:])
Expand Down
2 changes: 1 addition & 1 deletion test/test_cpi.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def test_cpi(linear_scenario):
cpi = CPI(
estimator=regression_model,
imputation_model_continuous=clone(imputation_model),
imputation_model_binary=LogisticRegression(),
imputation_model_categorical=LogisticRegression(),
n_permutations=20,
method="predict",
random_state=0,
Expand Down

0 comments on commit 302f5c0

Please sign in to comment.