Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve the coverage of the tests and fix some minor bugs #64

Merged
merged 28 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
6507587
add test for cluster and a method for getting groups
lionelkusch Dec 16, 2024
2233a5f
Add test for dcrt
lionelkusch Dec 16, 2024
cb02e0c
Add commit for desparsified functions and fixed one bug
lionelkusch Dec 16, 2024
d36dc68
Add tests for ensemble clustered
lionelkusch Dec 16, 2024
65282b5
Add test for the knockoff functions.
lionelkusch Dec 16, 2024
968500c
Add test for Reid methods
lionelkusch Dec 16, 2024
05628cf
Add test for permutation_test
lionelkusch Dec 16, 2024
744f201
Add tests and fix name of one variable
lionelkusch Dec 16, 2024
f3473c7
Add test and change name of variables
lionelkusch Dec 16, 2024
9b872a8
Unnecessary file
lionelkusch Dec 16, 2024
4683e58
Fix typo
lionelkusch Dec 18, 2024
a830f04
Fix syntax
lionelkusch Dec 18, 2024
63e6c54
Fix bugs by changing name of groups
lionelkusch Dec 18, 2024
3dfc30e
Modified scenario with group
lionelkusch Dec 19, 2024
369ccdb
Fix typo
lionelkusch Dec 19, 2024
7e03524
don't use global variable
lionelkusch Jan 6, 2025
c0f2b5a
Merge branch 'main' into PR_test_increase
lionelkusch Jan 6, 2025
794310e
Apply suggestions from code review
lionelkusch Jan 7, 2025
9c69a8f
Change pvalue to evalue
lionelkusch Jan 7, 2025
12edb23
Fix bugs and typo
lionelkusch Jan 7, 2025
e06122f
Apply suggestions from code review
lionelkusch Jan 17, 2025
cee3f9a
Update test_scenario.py
lionelkusch Jan 17, 2025
0937e9c
Reverse the option group
lionelkusch Jan 17, 2025
d5b0e28
Merge branch 'main' into PR_test_increase
lionelkusch Jan 17, 2025
852106f
Remove unnecessary import
lionelkusch Jan 17, 2025
943fe11
Format file
lionelkusch Jan 17, 2025
85b44a3
small difference
lionelkusch Jan 20, 2025
85a6026
remove file setup
lionelkusch Jan 21, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/hidimstat/desparsified_lasso.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def _compute_residuals(

else:

ValueError("The only regression method available is 'lasso'")
raise ValueError("The only regression method available is 'lasso'")

clf.fit(X_new, y)
z = y - clf.predict(X_new)
Expand Down
4 changes: 3 additions & 1 deletion src/hidimstat/scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,19 +59,21 @@ def multivariate_1D_simulation(

rng = np.random.default_rng(seed)

# generate random data for each samples
X = np.zeros((n_samples, n_features))
X[:, 0] = rng.standard_normal(n_samples)

for i in np.arange(1, n_features):
rand_vector = ((1 - rho**2) ** 0.5) * rng.standard_normal(n_samples)
X[:, i] = rho * X[:, i - 1] + rand_vector

if shuffle:
rng.shuffle(X.T)

# generate the vector of variable of importances
beta = np.zeros(n_features)
beta[0:support_size] = 1.0

# generate the simulated regression data
noise = sigma * rng.standard_normal(n_samples)
y = np.dot(X, beta) + noise

Expand Down
14 changes: 0 additions & 14 deletions src/hidimstat/setup.py

This file was deleted.

2 changes: 1 addition & 1 deletion src/hidimstat/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def _fixed_quantile_aggregation(pvals, gamma=0.5):
def _adaptive_quantile_aggregation(pvals, gamma_min=0.05):
"""adaptive version of the quantile aggregation method, Meinshausen et al.
(2008)"""
gammas = np.arange(gamma_min, 1.05, 0.05)
gammas = np.linspace(gamma_min, 1.0, 30)
list_Q = np.array([_fixed_quantile_aggregation(pvals, gamma) for gamma in gammas])

return np.minimum(1, (1 - np.log(gamma_min)) * list_Q.min(0))
Expand Down
122 changes: 112 additions & 10 deletions test/test_clustered_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Test the clustered_inference module
"""

import pytest
import numpy as np
from numpy.testing import assert_almost_equal
from sklearn.cluster import FeatureAgglomeration
Expand All @@ -14,16 +15,17 @@
)


def test_clustered_inference():
"""Testing the procedure on two simulations with a 1D data structure and
with n << p: the first test has no temporal dimension, the second has a
temporal dimension. The support is connected and of size 10, it must be
recovered with a small spatial tolerance parametrized by `margin_size`.
# Scenario 1: data with no temporal dimension
def test_clustered_inference_no_temporal():
"""
Testing the procedure on one simulations with a 1D data structure and
with n << p: no temporal dimension. The support is connected and of
size 10, it must be recovered with a small spatial tolerance
parametrized by `margin_size`.
Computing one sided p-values, we want low p-values for the features of
the support and p-values close to 0.5 for the others."""
the support and p-values close to 0.5 for the others.
"""

# Scenario 1: data with no temporal dimension
# ###########################################
n_samples, n_features = 100, 2000
support_size = 10
sigma = 5.0
Expand Down Expand Up @@ -63,8 +65,17 @@ def test_clustered_inference():
pval_corr[extended_support:200], expected[extended_support:200], decimal=1
)

# Scenario 2: temporal data
# #########################

# Scenario 2: temporal data
def test_clustered_inference_temporal():
"""
Testing the procedure on two simulations with a 1D data structure and
with n << p: with a temporal dimension. The support is connected and
of size 10, it must be recovered with a small spatial tolerance
parametrized by `margin_size`.
Computing one sided p-values, we want low p-values for the features of
the support and p-values close to 0.5 for the others.
"""
n_samples, n_features, n_times = 200, 2000, 10
support_size = 10
sigma = 5.0
Expand Down Expand Up @@ -104,3 +115,94 @@ def test_clustered_inference():
assert_almost_equal(
pval_corr[extended_support:], expected[extended_support:], decimal=1
)


# Scenario 3: data with no temporal dimension and with groups
def test_clustered_inference_no_temporal_groups():
    """
    Test the procedure on one simulation with a 1D data structure,
    no temporal dimension, and n << p. The support is connected and of
    size 10; it must be recovered with a small spatial tolerance
    parametrized by `margin_size`.
    The samples are split into 10 groups of equal size.
    Computing one-sided p-values, we expect low p-values for the features
    of the support and p-values close to 0.5 for the others.
    """

    n_samples, n_features = 20, 2000
    support_size = 10
    n_groups = 10
    sigma = 5.0
    rho = 0.95
    n_clusters = 200
    margin_size = 5
    interior_support = support_size - margin_size
    extended_support = support_size + margin_size

    # Simulate one independent dataset per group (distinct seeds).
    simulations = [
        multivariate_1D_simulation(
            n_samples=n_samples,
            n_features=n_features,
            support_size=support_size,
            sigma=sigma,
            rho=rho,
            shuffle=False,
            seed=2 + group_idx,
        )
        for group_idx in range(n_groups)
    ]

    # Stack the per-group data, then center targets and features.
    y_all = np.concatenate([sim[1] for sim in simulations])
    y_all = y_all - np.mean(y_all)
    X_all = np.concatenate([sim[0] for sim in simulations])
    X_all = X_all - np.mean(X_all, axis=0)
    # Group label of each sample: n_samples consecutive rows per group.
    groups = np.repeat(np.arange(0, n_groups), n_samples)

    # 1D chain connectivity for Ward feature agglomeration.
    connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1)
    ward = FeatureAgglomeration(
        n_clusters=n_clusters, connectivity=connectivity, linkage="ward"
    )

    beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = (
        clustered_inference(X_all, y_all, ward, n_clusters, groups=groups)
    )

    # Null features should sit near 0.5; support features near 0.
    expected = 0.5 * np.ones(n_features)
    expected[:support_size] = 0.0

    assert_almost_equal(pval_corr[:interior_support], expected[:interior_support])
    assert_almost_equal(
        pval_corr[extended_support:200], expected[extended_support:200], decimal=1
    )


# Scenario 4: check the exception raised for an unknown inference method
def test_clustered_inference_exception_methods():
    """
    Testing the procedure on one simulation with a 1D data structure and
    checking that it raises a ValueError when an unknown inference method
    is provided.
    """
    n_samples, n_features = 100, 2000
    n_clusters = 200

    # Simulated regression data; only X and y are used here.
    X_init, y, beta, epsilon = multivariate_1D_simulation(
        n_samples=n_samples,
        n_features=n_features,
        shuffle=False,
        seed=2,
    )

    # Center target and features, as expected by the inference routine.
    y = y - np.mean(y)
    X_init = X_init - np.mean(X_init, axis=0)

    # 1D chain connectivity for Ward feature agglomeration.
    connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1)
    ward = FeatureAgglomeration(
        n_clusters=n_clusters, connectivity=connectivity, linkage="ward"
    )

    # "lll" is not a supported method; the error message spelling
    # ("Unknow method") matches the project source as-is.
    with pytest.raises(ValueError, match="Unknow method"):
        clustered_inference(X_init, y, ward, n_clusters, method="lll")
31 changes: 28 additions & 3 deletions test/test_dcrt.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def test_dcrt_lasso():
"""
X, y = make_regression(n_samples=100, n_features=10, noise=0.2, random_state=2024)
# Checking if a loss != 'least_square'
with pytest.raises(Exception):
with pytest.raises(ValueError, match="test loss is not supported."):
_ = dcrt_zero(
X,
y,
Expand All @@ -27,7 +27,7 @@ def test_dcrt_lasso():
)

# Checking for a different statistic
with pytest.raises(Exception):
with pytest.raises(ValueError, match="test statistic is not supported."):
_ = dcrt_zero(
X,
y,
Expand All @@ -37,6 +37,30 @@ def test_dcrt_lasso():
random_state=2024,
)

# Checking for bad selection of screening_threshold
result_th_screen_bad = dcrt_zero(
X,
y,
screening_threshold=0,
screening=True,
verbose=False,
)
assert result_th_screen_bad.size == 0

# Checking for bad selection of screening_threshold with verbose
result_th_screen_bad = dcrt_zero(
X,
y,
screening_threshold=0,
screening=True,
verbose=True,
)

assert len(result_th_screen_bad) == 3
assert result_th_screen_bad[0].size == 0
assert np.all(result_th_screen_bad[1] == np.ones(10))
assert np.all(result_th_screen_bad[2] == np.zeros(10))

# Checking with and without screening
results_no_screening = dcrt_zero(
X, y, screening=False, verbose=True, statistic="residual", random_state=2024
Expand Down Expand Up @@ -70,7 +94,8 @@ def test_dcrt_lasso():
X,
y,
refit=True,
screening=False,
screening=True,
screening_threshold=50,
verbose=True,
statistic="residual",
random_state=2024,
Expand Down
11 changes: 11 additions & 0 deletions test/test_desparsified_lasso.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np
from numpy.testing import assert_almost_equal, assert_equal
from scipy.linalg import toeplitz
import pytest

from hidimstat.desparsified_lasso import desparsified_group_lasso, desparsified_lasso
from hidimstat.scenario import (
Expand Down Expand Up @@ -48,6 +49,16 @@ def test_desparsified_lasso():
assert_equal(cb_max > beta, True)


def test_desparsified_lasso_exception():
    """Check that requesting a residual method other than lasso raises."""

    # Default simulation settings are sufficient; only X and y are used.
    X, y, beta, noise = multivariate_1D_simulation()
    expected_message = "The only regression method available is 'lasso'"
    with pytest.raises(ValueError, match=expected_message):
        desparsified_lasso(X, y, residual_method="test")


def test_desparsified_group_lasso():
"""Testing the procedure on a simulation with no structure and
a support of size 2. Computing one-sided p-values, we want
Expand Down
52 changes: 52 additions & 0 deletions test/test_ensemble_clustered_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

import numpy as np
import pytest
from numpy.testing import assert_almost_equal
from sklearn.cluster import FeatureAgglomeration
from sklearn.feature_extraction import image
Expand Down Expand Up @@ -125,3 +126,54 @@ def test_ensemble_clustered_inference():
assert_almost_equal(
pval_corr[extended_support:], expected[extended_support:], decimal=1
)

beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = (
ensemble_clustered_inference(
X,
Y,
ward,
n_clusters,
n_bootstraps=n_bootstraps,
inference_method=inference_method,
ensembling_method="medians",
)
)

expected = 0.5 * np.ones(n_features)
expected[:support_size] = 0.0

assert_almost_equal(
pval_corr[:interior_support], expected[:interior_support], decimal=3
)
assert_almost_equal(
pval_corr[extended_support:], expected[extended_support:], decimal=1
)


def test_ensemble_clustered_inference_exception():
    """
    Check that ensemble_clustered_inference raises ValueError for an
    unknown ensembling method and for an invalid `memory` argument.
    """
    n_samples, n_features = 100, 2000
    n_clusters = 10

    # Simulated regression data; only X and Y are used below.
    X, Y, beta, epsilon = multivariate_1D_simulation(
        n_samples=n_samples,
        n_features=n_features,
    )

    # 1D chain connectivity for Ward feature agglomeration.
    connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1)
    ward = FeatureAgglomeration(
        n_clusters=n_clusters, connectivity=connectivity, linkage="ward"
    )

    # An unsupported ensembling method must be rejected.
    with pytest.raises(ValueError, match="Unknown ensembling method."):
        ensemble_clustered_inference(
            X, Y, ward, n_clusters, ensembling_method="wrong_method"
        )

    # `memory` must be None or a path string; a list is invalid.
    memory_error = (
        "'memory' must be None or a string corresponding "
        "to the path of the caching directory."
    )
    with pytest.raises(ValueError, match=memory_error):
        ensemble_clustered_inference(X, Y, ward, n_clusters, memory=[])
Loading
Loading