diff --git a/examples/plot_diabetes_variable_importance_example.py b/examples/plot_diabetes_variable_importance_example.py
index 924f707..5292159 100644
--- a/examples/plot_diabetes_variable_importance_example.py
+++ b/examples/plot_diabetes_variable_importance_example.py
@@ -119,7 +119,7 @@
     cpi = CPI(
         estimator=regressor_list[i],
         imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 10)),
-        imputation_model_binary=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)),
+        imputation_model_categorical=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)),
         # covariate_estimator=HistGradientBoostingRegressor(random_state=0,),
         n_permutations=50,
         random_state=0,
diff --git a/src/hidimstat/conditional_permutation_importance.py b/src/hidimstat/conditional_permutation_importance.py
index f80cc44..39e7303 100644
--- a/src/hidimstat/conditional_permutation_importance.py
+++ b/src/hidimstat/conditional_permutation_importance.py
@@ -16,7 +16,6 @@ def __init__(
         n_jobs: int = 1,
         n_permutations: int = 50,
         imputation_model_continuous=None,
-        imputation_model_binary=None,
         imputation_model_categorical=None,
         random_state: int = None,
         categorical_max_cardinality: int = 10,
@@ -46,12 +45,10 @@ def __init__(
         imputation_model_continuous : sklearn compatible estimator, optional
             The model used to estimate the conditional distribution of a given
             continuous variable/group of variables given the others.
-        imputation_model_binary : sklearn compatible estimator, optional
-            The model used to estimate the conditional distribution of a given
-            binary variable/group of variables given the others.
         imputation_model_categorical : sklearn compatible estimator, optional
             The model used to estimate the conditional distribution of a given
-            categorical variable/group of variables given the others.
+            categorical variable/group of variables given the others. Binary is
+            considered a special case of categorical.
         random_state : int, default=None
             The random state to use for sampling.
         categorical_max_cardinality : int, default=10
@@ -71,7 +68,6 @@ def __init__(
 
         self.imputation_model = {
             "continuous": imputation_model_continuous,
-            "binary": imputation_model_binary,
             "categorical": imputation_model_categorical,
         }
         self.categorical_max_cardinality = categorical_max_cardinality
@@ -92,11 +88,6 @@ def fit(self, X, y=None, groups=None, var_type="auto"):
                     if self.imputation_model["continuous"] is None
                     else clone(self.imputation_model["continuous"])
                 ),
-                model_binary=(
-                    None
-                    if self.imputation_model["binary"] is None
-                    else clone(self.imputation_model["binary"])
-                ),
                 model_categorical=(
                     None
                     if self.imputation_model["categorical"] is None
diff --git a/src/hidimstat/conditional_sampling.py b/src/hidimstat/conditional_sampling.py
index 0fd31dd..4a79f12 100644
--- a/src/hidimstat/conditional_sampling.py
+++ b/src/hidimstat/conditional_sampling.py
@@ -8,7 +8,6 @@ class ConditionalSampler:
     def __init__(
         self,
         model_regression=None,
-        model_binary=None,
         model_categorical=None,
         data_type: str = "auto",
         random_state=None,
@@ -21,15 +20,14 @@
         ----------
         model_regression : sklearn compatible estimator, optional
             The model to use for continuous data.
-        model_binary : sklearn compatible estimator, optional
-            The model to use for binary data.
         model_categorical : sklearn compatible estimator, optional
-            The model to use for categorical data.
+            The model to use for categorical data. Binary is considered a special
+            case of categorical data.
         data_type : str, default="auto"
-            The variable type. Supported types include "auto", "continuous", "binary",
-            and "categorical". If "auto", the type is inferred from the cardinality of
-            the unique values passed to the `fit` method. For categorical variables, the
-            default strategy is to use a one-vs-rest classifier.
+            The variable type. Supported types include "auto", "continuous", and
+            "categorical". If "auto", the type is inferred from the cardinality
+            of the unique values passed to the `fit` method. For categorical variables,
+            the default strategy is to use a one-vs-rest classifier.
         random_state : int, optional
             The random state to use for sampling.
         categorical_max_cardinality : int, default=10
@@ -39,19 +37,15 @@
         """
         self.data_type = data_type
         self.model_regression = model_regression
-        self.model_binary = model_binary
         self.model_categorical = model_categorical
 
         if data_type == "auto":
             self.model_auto = {
                 "continuous": model_regression,
-                "binary": model_binary,
                 "categorical": model_categorical,
             }
         elif data_type == "continuous":
             self.model = model_regression
-        elif data_type == "binary":
-            self.model = model_binary
         elif data_type == "categorical":
             self.model = model_categorical
         else:
@@ -72,10 +66,7 @@
 
         """
         if self.data_type == "auto":
-            if len(np.unique(y)) == 2:
-                self.data_type = "binary"
-                self.model = self.model_auto["binary"]
-            elif len(np.unique(y)) <= self.categorical_max_cardinality:
+            if len(np.unique(y)) <= self.categorical_max_cardinality:
                 self.data_type = "categorical"
                 self.model = self.model_auto["categorical"]
             else:
@@ -84,7 +75,7 @@
 
         # Group of variables
         if (y.ndim > 1) and (y.shape[1] > 1):
-            if self.data_type in ["binary", "categorical"]:
+            if self.data_type == "categorical":
                 self.model = MultiOutputClassifier(self.model)
             elif self.data_type == "continuous" and not issubclass(
                 self.model.__class__, MultiOutputMixin
@@ -130,7 +121,7 @@ def sample(self, X: np.ndarray, y: np.ndarray, n_samples: int = 1) -> np.ndarray
 
             )
            return y_hat[np.newaxis, ...] + residual_permuted
-        elif self.data_type in ["binary", "categorical"]:
+        elif self.data_type == "categorical":
             if not hasattr(self.model, "predict_proba"):
                 raise AttributeError(
                     "The model must have a `predict_proba` method to be used for \
diff --git a/test/test_conditional_sampling.py b/test/test_conditional_sampling.py
index 5ab5cc0..183d2c2 100644
--- a/test/test_conditional_sampling.py
+++ b/test/test_conditional_sampling.py
@@ -53,8 +53,8 @@ def test_binary_case():
     np.random.seed(40)
 
     sampler = ConditionalSampler(
-        model_binary=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)),
-        data_type="binary",
+        model_categorical=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)),
+        data_type="categorical",
         random_state=0,
     )
 
@@ -78,9 +78,8 @@
         assert accuracy_score(X_1_perm[i], X[:, 1]) < 0.6
 
     sampler = ConditionalSampler(
-        model_binary=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)),
         model_regression=RidgeCV(alphas=np.logspace(-2, 2, 10)),
-        model_categorical=None,
+        model_categorical=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)),
         data_type="auto",
         random_state=0,
     )
@@ -99,7 +98,7 @@ def test_error():
     np.random.seed(40)
     sampler = ConditionalSampler(
         model_regression=RidgeClassifier(),
-        data_type="binary",
+        data_type="categorical",
         random_state=0,
     )
     X = np.random.randint(0, 2, size=(100, 2))
@@ -114,7 +113,7 @@
     )
     with pytest.raises(AttributeError):
         sampler.fit(np.delete(X, 1, axis=1), X[:, 1])
-        sampler.sample()
+        sampler.sample(np.delete(X, 1, axis=1), X[:, 1])
 
     sampler = ConditionalSampler(
         data_type="auto",
@@ -161,8 +160,8 @@
     X[:, 4] = 2 * X[:, 1] - 1 + np.random.randn(X.shape[0]) * 0.3 > 0
     model = LogisticRegressionCV(Cs=np.logspace(-2, 2, 10))
     sampler = ConditionalSampler(
-        model_binary=model,
-        data_type="binary",
+        model_categorical=model,
+        data_type="categorical",
         random_state=0,
     )
     sampler.fit(X[:, :3], X[:, 3:])
diff --git a/test/test_cpi.py b/test/test_cpi.py
index c2ff43b..9d7063a 100644
--- a/test/test_cpi.py
+++ b/test/test_cpi.py
@@ -25,7 +25,7 @@ def test_cpi(linear_scenario):
     cpi = CPI(
         estimator=regression_model,
         imputation_model_continuous=clone(imputation_model),
-        imputation_model_binary=LogisticRegression(),
+        imputation_model_categorical=LogisticRegression(),
         n_permutations=20,
         method="predict",
         random_state=0,
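
Net effect for callers: an estimator previously passed as `imputation_model_binary` / `model_binary` now goes through the `categorical` argument, and `data_type="auto"` routes every variable with at most `categorical_max_cardinality` distinct values (binary included) to the categorical model. A minimal usage sketch of the post-change `ConditionalSampler` API follows; the import path is inferred from the file layout above, and the data and variable names are illustrative, not taken from the repository:

    import numpy as np
    from sklearn.linear_model import LogisticRegressionCV, RidgeCV

    from hidimstat.conditional_sampling import ConditionalSampler

    rng = np.random.default_rng(0)

    # Two correlated continuous covariates and a binary variable derived from them.
    X = rng.standard_normal((200, 2))
    y = (X[:, 0] + 0.5 * X[:, 1] + 0.1 * rng.standard_normal(200) > 0).astype(int)

    # No `model_binary` anymore: with data_type="auto", y has 2 unique values,
    # which is <= categorical_max_cardinality (default 10), so it is treated as
    # categorical and handled by `model_categorical` (one-vs-rest by default).
    sampler = ConditionalSampler(
        model_regression=RidgeCV(alphas=np.logspace(-2, 2, 10)),
        model_categorical=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)),
        data_type="auto",
        random_state=0,
    )
    sampler.fit(X, y)

    # Draw 10 samples of y from its estimated conditional distribution given X.
    y_cond = sampler.sample(X, y, n_samples=10)
    print(y_cond.shape)  # expected: (10, 200), one row per conditional draw

The same substitution applies to `CPI`: a binary imputation model is now passed as `imputation_model_categorical`, as in the updated example and tests above.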