Skip to content

Commit

Permalink
Consider binary case as a subcase of categorical
Browse files Browse the repository at this point in the history
  • Loading branch information
jpaillard committed Feb 20, 2025
1 parent c325bcc commit 302f5c0
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 39 deletions.
2 changes: 1 addition & 1 deletion examples/plot_diabetes_variable_importance_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@
cpi = CPI(
estimator=regressor_list[i],
imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 10)),
imputation_model_binary=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)),
imputation_model_categorical=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)),
# covariate_estimator=HistGradientBoostingRegressor(random_state=0,),
n_permutations=50,
random_state=0,
Expand Down
13 changes: 2 additions & 11 deletions src/hidimstat/conditional_permutation_importance.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ def __init__(
n_jobs: int = 1,
n_permutations: int = 50,
imputation_model_continuous=None,
imputation_model_binary=None,
imputation_model_categorical=None,
random_state: int = None,
categorical_max_cardinality: int = 10,
Expand Down Expand Up @@ -46,12 +45,10 @@ def __init__(
imputation_model_continuous : sklearn compatible estimator, optional
The model used to estimate the conditional distribution of a given
continuous variable/group of variables given the others.
imputation_model_binary : sklearn compatible estimator, optional
The model used to estimate the conditional distribution of a given
binary variable/group of variables given the others.
imputation_model_categorical : sklearn compatible estimator, optional
The model used to estimate the conditional distribution of a given
categorical variable/group of variables given the others.
categorical variable/group of variables given the others. Binary is
considered a special case of categorical.
random_state : int, default=None
The random state to use for sampling.
categorical_max_cardinality : int, default=10
Expand All @@ -71,7 +68,6 @@ def __init__(

self.imputation_model = {
"continuous": imputation_model_continuous,
"binary": imputation_model_binary,
"categorical": imputation_model_categorical,
}
self.categorical_max_cardinality = categorical_max_cardinality
Expand All @@ -92,11 +88,6 @@ def fit(self, X, y=None, groups=None, var_type="auto"):
if self.imputation_model["continuous"] is None
else clone(self.imputation_model["continuous"])
),
model_binary=(
None
if self.imputation_model["binary"] is None
else clone(self.imputation_model["binary"])
),
model_categorical=(
None
if self.imputation_model["categorical"] is None
Expand Down
27 changes: 9 additions & 18 deletions src/hidimstat/conditional_sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ class ConditionalSampler:
def __init__(
self,
model_regression=None,
model_binary=None,
model_categorical=None,
data_type: str = "auto",
random_state=None,
Expand All @@ -21,15 +20,14 @@ def __init__(
----------
model_regression : sklearn compatible estimator, optional
The model to use for continuous data.
model_binary : sklearn compatible estimator, optional
The model to use for binary data.
model_categorical : sklearn compatible estimator, optional
The model to use for categorical data.
The model to use for categorical data. Binary is considered a special
case of categorical data.
data_type : str, default="auto"
The variable type. Supported types include "auto", "continuous", "binary",
and "categorical". If "auto", the type is inferred from the cardinality of
the unique values passed to the `fit` method. For categorical variables, the
default strategy is to use a one-vs-rest classifier.
The variable type. Supported types include "auto", "continuous", and
"categorical". If "auto", the type is inferred from the cardinality
of the unique values passed to the `fit` method. For categorical variables,
the default strategy is to use a one-vs-rest classifier.
random_state : int, optional
The random state to use for sampling.
categorical_max_cardinality : int, default=10
Expand All @@ -39,19 +37,15 @@ def __init__(
"""
self.data_type = data_type
self.model_regression = model_regression
self.model_binary = model_binary
self.model_categorical = model_categorical

if data_type == "auto":
self.model_auto = {
"continuous": model_regression,
"binary": model_binary,
"categorical": model_categorical,
}
elif data_type == "continuous":
self.model = model_regression
elif data_type == "binary":
self.model = model_binary
elif data_type == "categorical":
self.model = model_categorical
else:
Expand All @@ -72,10 +66,7 @@ def fit(self, X: np.ndarray, y: np.ndarray):
"""

if self.data_type == "auto":
if len(np.unique(y)) == 2:
self.data_type = "binary"
self.model = self.model_auto["binary"]
elif len(np.unique(y)) <= self.categorical_max_cardinality:
if len(np.unique(y)) <= self.categorical_max_cardinality:
self.data_type = "categorical"
self.model = self.model_auto["categorical"]
else:
Expand All @@ -84,7 +75,7 @@ def fit(self, X: np.ndarray, y: np.ndarray):

# Group of variables
if (y.ndim > 1) and (y.shape[1] > 1):
if self.data_type in ["binary", "categorical"]:
if self.data_type == "categorical":
self.model = MultiOutputClassifier(self.model)
elif self.data_type == "continuous" and not issubclass(
self.model.__class__, MultiOutputMixin
Expand Down Expand Up @@ -130,7 +121,7 @@ def sample(self, X: np.ndarray, y: np.ndarray, n_samples: int = 1) -> np.ndarray
)
return y_hat[np.newaxis, ...] + residual_permuted

elif self.data_type in ["binary", "categorical"]:
elif self.data_type == "categorical":
if not hasattr(self.model, "predict_proba"):
raise AttributeError(
"The model must have a `predict_proba` method to be used for \
Expand Down
15 changes: 7 additions & 8 deletions test/test_conditional_sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ def test_binary_case():
np.random.seed(40)

sampler = ConditionalSampler(
model_binary=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)),
data_type="binary",
model_categorical=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)),
data_type="categorical",
random_state=0,
)

Expand All @@ -78,9 +78,8 @@ def test_binary_case():
assert accuracy_score(X_1_perm[i], X[:, 1]) < 0.6

sampler = ConditionalSampler(
model_binary=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)),
model_regression=RidgeCV(alphas=np.logspace(-2, 2, 10)),
model_categorical=None,
model_categorical=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)),
data_type="auto",
random_state=0,
)
Expand All @@ -99,7 +98,7 @@ def test_error():
np.random.seed(40)
sampler = ConditionalSampler(
model_regression=RidgeClassifier(),
data_type="binary",
data_type="categorical",
random_state=0,
)
X = np.random.randint(0, 2, size=(100, 2))
Expand All @@ -114,7 +113,7 @@ def test_error():
)
with pytest.raises(AttributeError):
sampler.fit(np.delete(X, 1, axis=1), X[:, 1])
sampler.sample()
sampler.sample(np.delete(X, 1, axis=1), X[:, 1])

sampler = ConditionalSampler(
data_type="auto",
Expand Down Expand Up @@ -161,8 +160,8 @@ def test_group_case():
X[:, 4] = 2 * X[:, 1] - 1 + np.random.randn(X.shape[0]) * 0.3 > 0
model = LogisticRegressionCV(Cs=np.logspace(-2, 2, 10))
sampler = ConditionalSampler(
model_binary=model,
data_type="binary",
model_categorical=model,
data_type="categorical",
random_state=0,
)
sampler.fit(X[:, :3], X[:, 3:])
Expand Down
2 changes: 1 addition & 1 deletion test/test_cpi.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def test_cpi(linear_scenario):
cpi = CPI(
estimator=regression_model,
imputation_model_continuous=clone(imputation_model),
imputation_model_binary=LogisticRegression(),
imputation_model_categorical=LogisticRegression(),
n_permutations=20,
method="predict",
random_state=0,
Expand Down

0 comments on commit 302f5c0

Please sign in to comment.