From 6507587574279be257a8cd8f7649d828ef323bcb Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 16 Dec 2024 15:21:03 +0100 Subject: [PATCH 01/26] add test for clsuter and a method for getting groups --- hidimstat/scenario.py | 30 +++++- hidimstat/test/test_clustered_inference.py | 119 +++++++++++++++++++-- 2 files changed, 134 insertions(+), 15 deletions(-) diff --git a/hidimstat/scenario.py b/hidimstat/scenario.py index 9b8d4a5c..113c875a 100644 --- a/hidimstat/scenario.py +++ b/hidimstat/scenario.py @@ -16,6 +16,8 @@ def multivariate_1D_simulation( rho=0.0, shuffle=True, seed=0, + nb_group=0, + size_group=0, ): """Generate 1D data with Toeplitz design matrix @@ -41,11 +43,18 @@ def multivariate_1D_simulation( seed : int Seed used for generating design matrix and noise. + + nb_group : int + Number of groups. + + size_group : int + Size of each group. Returns ------- X : ndarray, shape (n_samples, n_features) Design matrix. + if their is some groups, the first rows contains the groups values. y : ndarray, shape (n_samples,) Target. @@ -58,17 +67,30 @@ def multivariate_1D_simulation( """ rng = np.random.default_rng(seed) - - X = np.zeros((n_samples, n_features)) - X[:, 0] = rng.standard_normal(n_samples) + if nb_group < 0 or size_group < 0: + raise ValueError( + "The number of groups and their size must be positive." + ) + n_generate_samples = n_samples - nb_group * size_group + if n_generate_samples <= 0: + raise ValueError( + "The number of samples is too small compate to the number " + "of group and their size to gerate the data." + ) + X = np.zeros((n_generate_samples, n_features)) + X[:, 0] = rng.standard_normal(n_generate_samples) for i in np.arange(1, n_features): - rand_vector = ((1 - rho**2) ** 0.5) * rng.standard_normal(n_samples) + rand_vector = ((1 - rho**2) ** 0.5) * rng.standard_normal(n_generate_samples) X[:, i] = rho * X[:, i - 1] + rand_vector if shuffle: rng.shuffle(X.T) + if nb_group > 0: + groups = np.repeat(X[:nb_group], size_group, axis=0) + X = np.vstack((groups, X)) + beta = np.zeros(n_features) beta[0:support_size] = 1.0 diff --git a/hidimstat/test/test_clustered_inference.py b/hidimstat/test/test_clustered_inference.py index 94e04ef0..13fdcd42 100644 --- a/hidimstat/test/test_clustered_inference.py +++ b/hidimstat/test/test_clustered_inference.py @@ -1,7 +1,7 @@ """ Test the clustered_inference module """ - +import pytest import numpy as np from numpy.testing import assert_almost_equal from sklearn.cluster import FeatureAgglomeration @@ -14,16 +14,17 @@ ) -def test_clustered_inference(): - """Testing the procedure on two simulations with a 1D data structure and - with n << p: the first test has no temporal dimension, the second has a - temporal dimension. The support is connected and of size 10, it must be - recovered with a small spatial tolerance parametrized by `margin_size`. +# Scenario 1: data with no temporal dimension +def test_clustered_inference_no_temporal(): + """ + Testing the procedure on one simulations with a 1D data structure and + with n << p: no temporal dimension. The support is connected and of + size 10, it must be recovered with a small spatial tolerance + parametrized by `margin_size`. Computing one sided p-values, we want low p-values for the features of - the support and p-values close to 0.5 for the others.""" + the support and p-values close to 0.5 for the others. 
+ """ - # Scenario 1: data with no temporal dimension - # ########################################### n_samples, n_features = 100, 2000 support_size = 10 sigma = 5.0 @@ -63,8 +64,17 @@ def test_clustered_inference(): pval_corr[extended_support:200], expected[extended_support:200], decimal=1 ) - # Scenario 2: temporal data - # ######################### + +# Scenario 2: temporal data +def test_clustered_inference_temporal(): + """ + Testing the procedure on two simulations with a 1D data structure and + with n << p: with a temporal dimension. The support is connected and + of size 10, it must be recovered with a small spatial tolerance + parametrized by `margin_size`. + Computing one sided p-values, we want low p-values for the features of + the support and p-values close to 0.5 for the others. + """ n_samples, n_features, n_times = 200, 2000, 10 support_size = 10 sigma = 5.0 @@ -104,3 +114,90 @@ def test_clustered_inference(): assert_almost_equal( pval_corr[extended_support:], expected[extended_support:], decimal=1 ) + + +# Scenario 3: data with no temporal dimension and with groups +def test_clustered_inference_no_temporal_groups(): + """ + Testing the procedure on one simulations with a 1D data structure and + with n << p: no temporal dimension. The support is connected and of + size 10, it must be recovered with a small spatial tolerance + parametrized by `margin_size`. + We group the sample in 10 groups of size 10. + Computing one sided p-values, we want low p-values for the features of + the support and p-values close to 0.5 for the others. + """ + + n_samples, n_features = 100, 2000 + support_size = 10 + nb_groups = 9 + size = 10 + sigma = 5.0 + rho = 0.95 + n_clusters = 200 + margin_size = 5 + interior_support = support_size - margin_size + extended_support = support_size + margin_size + + X_init, y, beta, epsilon = multivariate_1D_simulation( + n_samples=n_samples, + n_features=n_features, + support_size=support_size, + sigma=sigma, + rho=rho, + shuffle=False, + seed=2, + nb_group=nb_groups, + size_group=size, + ) + groups = np.concatenate([[i]*size for i in range(nb_groups+1)]) + + y = y - np.mean(y) + X_init = X_init - np.mean(X_init, axis=0) + + connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1) + ward = FeatureAgglomeration( + n_clusters=n_clusters, connectivity=connectivity, linkage="ward" + ) + + beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( + clustered_inference(X_init, y, ward, n_clusters, groups=groups) + ) + + expected = 0.5 * np.ones(n_features) + expected[:support_size] = 0.0 + + assert_almost_equal(pval_corr[:interior_support], expected[:interior_support]) + assert_almost_equal( + pval_corr[extended_support:200], expected[extended_support:200], decimal=1 + ) + + + +# Scenario 1: data with no temporal dimension +def test_clustered_inference_exception_methods(): + """ + Testing the procedure on two simulations with a 1D data structure and + checking that the procedure raises an exception when an unknown method is + provided. 
+ """ + n_samples, n_features = 100, 2000 + n_clusters = 200 + + X_init, y, beta, epsilon = multivariate_1D_simulation( + n_samples=n_samples, + n_features=n_features, + shuffle=False, + seed=2, + ) + + y = y - np.mean(y) + X_init = X_init - np.mean(X_init, axis=0) + + connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1) + ward = FeatureAgglomeration( + n_clusters=n_clusters, connectivity=connectivity, linkage="ward" + ) + + with pytest.raises(ValueError, match="Unknow method"): + clustered_inference(X_init, y, ward, n_clusters, method="lll") \ No newline at end of file From 2233a5f84d448ee88e2ad0e838c75b4aaf1ddc43 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 16 Dec 2024 15:21:31 +0100 Subject: [PATCH 02/26] Add test for dcrt --- hidimstat/test/test_dcrt.py | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/hidimstat/test/test_dcrt.py b/hidimstat/test/test_dcrt.py index da2085cb..ac2b2061 100644 --- a/hidimstat/test/test_dcrt.py +++ b/hidimstat/test/test_dcrt.py @@ -15,7 +15,7 @@ def test_dcrt_lasso(): """ X, y = make_regression(n_samples=100, n_features=10, noise=0.2, random_state=2024) # Checking if a loss != 'least_square' - with pytest.raises(Exception): + with pytest.raises(ValueError, match="test loss is not supported."): _ = dcrt_zero( X, y, @@ -27,7 +27,7 @@ def test_dcrt_lasso(): ) # Checking for a different statistic - with pytest.raises(Exception): + with pytest.raises(ValueError, match="test statistic is not supported."): _ = dcrt_zero( X, y, @@ -36,6 +36,30 @@ def test_dcrt_lasso(): statistic="test", random_state=2024, ) + + # Checking for bad selection of screening_threshold + result_th_screen_bad = dcrt_zero( + X, + y, + screening_threshold = 0, + screening=True, + verbose=False, + ) + assert result_th_screen_bad.size == 0 + + # Checking for bad selection of screening_threshold with verbose + result_th_screen_bad = dcrt_zero( + X, + y, + screening_threshold = 0, + screening=True, + verbose=True, + ) + + assert len(result_th_screen_bad) == 3 + assert result_th_screen_bad[0].size == 0 + assert np.all(result_th_screen_bad[1] == np.ones(10)) + assert np.all(result_th_screen_bad[2] == np.zeros(10)) # Checking with and without screening results_no_screening = dcrt_zero( @@ -70,7 +94,8 @@ def test_dcrt_lasso(): X, y, refit=True, - screening=False, + screening=True, + screening_threshold = 50, verbose=True, statistic="residual", random_state=2024, From cb02e0c41fa47a6ae0287711e582be62494ed9f6 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 16 Dec 2024 15:21:57 +0100 Subject: [PATCH 03/26] Add commit for desparsified functions and fixed one bug --- hidimstat/desparsified_lasso.py | 2 +- hidimstat/test/test_desparsified_lasso.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/hidimstat/desparsified_lasso.py b/hidimstat/desparsified_lasso.py index aaa5e03e..5081a47c 100644 --- a/hidimstat/desparsified_lasso.py +++ b/hidimstat/desparsified_lasso.py @@ -57,7 +57,7 @@ def _compute_residuals( else: - ValueError("The only regression method available is 'lasso'") + raise ValueError("The only regression method available is 'lasso'") clf.fit(X_new, y) z = y - clf.predict(X_new) diff --git a/hidimstat/test/test_desparsified_lasso.py b/hidimstat/test/test_desparsified_lasso.py index 713bd2c5..007c26cc 100644 --- a/hidimstat/test/test_desparsified_lasso.py +++ b/hidimstat/test/test_desparsified_lasso.py @@ -5,6 +5,7 @@ import numpy as np from numpy.testing import assert_almost_equal, 
assert_equal from scipy.linalg import toeplitz +import pytest from hidimstat.desparsified_lasso import desparsified_group_lasso, desparsified_lasso from hidimstat.scenario import ( @@ -48,6 +49,15 @@ def test_desparsified_lasso(): assert_equal(cb_max > beta, True) +def test_desparsified_lasso_exception(): + """Testing exception of not using lasso""" + + X, y, beta, noise = multivariate_1D_simulation( + ) + with pytest.raises(ValueError, match="The only regression method available is 'lasso'"): + _ = desparsified_lasso(X, y, residual_method='test') + + def test_desparsified_group_lasso(): """Testing the procedure on a simulation with no structure and a support of size 2. Computing one-sided p-values, we want From d36dc6818aa370f2b1edafa54e15230f5cfd34ac Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 16 Dec 2024 16:14:42 +0100 Subject: [PATCH 04/26] Add tests for ensemble clustered --- .../test/test_ensemble_clustered_inference.py | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/hidimstat/test/test_ensemble_clustered_inference.py b/hidimstat/test/test_ensemble_clustered_inference.py index 5129a229..39381108 100644 --- a/hidimstat/test/test_ensemble_clustered_inference.py +++ b/hidimstat/test/test_ensemble_clustered_inference.py @@ -3,6 +3,7 @@ """ import numpy as np +import pytest from numpy.testing import assert_almost_equal from sklearn.cluster import FeatureAgglomeration from sklearn.feature_extraction import image @@ -125,3 +126,63 @@ def test_ensemble_clustered_inference(): assert_almost_equal( pval_corr[extended_support:], expected[extended_support:], decimal=1 ) + + beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( + ensemble_clustered_inference( + X, + Y, + ward, + n_clusters, + n_bootstraps=n_bootstraps, + inference_method=inference_method, + ensembling_method="medians" + ) + ) + + expected = 0.5 * np.ones(n_features) + expected[:support_size] = 0.0 + + assert_almost_equal( + pval_corr[:interior_support], expected[:interior_support], decimal=3 + ) + assert_almost_equal( + pval_corr[extended_support:], expected[extended_support:], decimal=1 + ) + + + + +def test_ensemble_clustered_inference_exception(): + """ + Test the raise of exception + """ + n_samples, n_features = 100, 2000 + n_clusters = 10 + X, Y, beta, epsilon = multivariate_1D_simulation( + n_samples=n_samples, n_features=n_features, + ) + connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1) + ward = FeatureAgglomeration( + n_clusters=n_clusters, connectivity=connectivity, linkage="ward" + ) + + # Test the raise of exception + with pytest.raises(ValueError, match="Unknown ensembling method."): + ensemble_clustered_inference( + X, + Y, + ward, + n_clusters, + ensembling_method="wrong_method" + ) + + + with pytest.raises(ValueError, match="'memory' must be None or a string corresponding " + + "to the path of the caching directory."): + ensemble_clustered_inference( + X, + Y, + ward, + n_clusters, + memory=[] + ) \ No newline at end of file From 65282b56f8ae25f27e2b669b2be8ba3041f32afb Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 16 Dec 2024 16:44:29 +0100 Subject: [PATCH 05/26] Add test for the knockoff functions. 
--- hidimstat/test/test_generate_knockoff.py | 37 ++++++++++++++++++--- hidimstat/test/test_knockoff_aggregation.py | 23 ++++++++++--- hidimstat/test/test_model_x_knockoff.py | 15 +++++++++ 3 files changed, 65 insertions(+), 10 deletions(-) diff --git a/hidimstat/test/test_generate_knockoff.py b/hidimstat/test/test_generate_knockoff.py index 54056f79..44430ec7 100644 --- a/hidimstat/test/test_generate_knockoff.py +++ b/hidimstat/test/test_generate_knockoff.py @@ -6,12 +6,12 @@ _estimate_distribution, gaussian_knockoff_generation, ) +import pytest -SEED = 42 -fdr = 0.1 -def test_estimate_distribution(): +def test_estimate_distribution_wolf(): + SEED = 42 n = 100 p = 50 X, y, _, non_zero = simu_data(n, p, seed=SEED) @@ -20,18 +20,45 @@ def test_estimate_distribution(): assert mu.size == p assert Sigma.shape == (p, p) - mu, Sigma = _estimate_distribution(X, cov_estimator="graph_lasso") + +def test_estimate_distribution_lasso(): + SEED = 42 + n = 100 + p = 50 + X, y, _, non_zero = simu_data(n, p, seed=SEED) + mu, Sigma = _estimate_distribution(X, shrink=True, cov_estimator="graph_lasso") assert mu.size == p assert Sigma.shape == (p, p) +def test_gaussian_knockoff_estimate_exception(): + SEED = 42 + n = 100 + p = 50 + X, y, _, non_zero = simu_data(n, p, seed=SEED) + with pytest.raises(ValueError, match="test is not a valid covariance estimated method"): + _estimate_distribution(X, shrink=True, cov_estimator="test") + def test_gaussian_knockoff_equi(): + SEED = 42 n = 100 p = 50 X, y, _, non_zero = simu_data(n, p, seed=SEED) - mu, Sigma = _estimate_distribution(X, cov_estimator="ledoit_wolf") + mu, Sigma = _estimate_distribution(X, shrink=True, cov_estimator="ledoit_wolf") X_tilde = gaussian_knockoff_generation(X, mu, Sigma, method="equi", seed=SEED * 2) assert X_tilde.shape == (n, p) + + +def test_gaussian_knockoff_exception(): + SEED = 42 + n = 100 + p = 50 + X, y, _, non_zero = simu_data(n, p, seed=SEED) + mu, Sigma = _estimate_distribution(X, shrink=True, cov_estimator="ledoit_wolf") + + with pytest.raises(ValueError, match="test is not a valid knockoff contriction method"): + gaussian_knockoff_generation(X, mu, Sigma, method="test", seed=SEED * 2) + diff --git a/hidimstat/test/test_knockoff_aggregation.py b/hidimstat/test/test_knockoff_aggregation.py index a2984fc7..992ef6dd 100644 --- a/hidimstat/test/test_knockoff_aggregation.py +++ b/hidimstat/test/test_knockoff_aggregation.py @@ -54,6 +54,21 @@ def test_knockoff_aggregation(): assert fdp_verbose < 0.5 assert power_verbose > 0.1 + selected_ko, test_scored, thres, X_tilde = model_x_knockoff(X, y, fdr=fdr, seed=5, verbose=True) + #TODO add tests for the 3 other variables + + np.testing.assert_array_equal(selected_no_verbose, selected_ko) + + fdp_no_verbose, power_no_verbose = cal_fdp_power( + selected_no_verbose, non_zero_index + ) + fdp_verbose, power_verbose = cal_fdp_power(selected_verbose, non_zero_index) + + assert fdp_verbose < 0.5 + assert power_verbose > 0.1 + + + # Using e-values aggregation (verbose vs no verbose) selected_verbose, aggregated_pval, pvals = knockoff_aggregation( @@ -89,7 +104,7 @@ def test_knockoff_aggregation(): assert power_no_verbose > 0.1 # Checking wrong type for random_state - with pytest.raises(Exception): + with pytest.raises(TypeError, match="Wrong type for random_state"): _ = knockoff_aggregation( X, y, @@ -97,20 +112,18 @@ def test_knockoff_aggregation(): ) # Checking value for offset not belonging to (0,1) - with pytest.raises(Exception): + with pytest.raises(ValueError, match="'offset' must be either 
0 or 1"): _ = knockoff_aggregation( X, y, offset=2, method="quantile", - random_state="test", ) - with pytest.raises(Exception): + with pytest.raises(ValueError, match="'offset' must be either 0 or 1"): _ = knockoff_aggregation( X, y, offset=2, method="e-values", - random_state="test", ) diff --git a/hidimstat/test/test_model_x_knockoff.py b/hidimstat/test/test_model_x_knockoff.py index 504e4694..6caa3902 100644 --- a/hidimstat/test/test_model_x_knockoff.py +++ b/hidimstat/test/test_model_x_knockoff.py @@ -16,3 +16,18 @@ def test_model_x_knockoff(): assert fdp <= 0.2 assert power > 0.7 + + +def test_model_x_knockoff_with_verbose(): + n = 300 + p = 300 + X, y, _, non_zero = simu_data(n, p, seed=seed) + ko_result, test_scored, thres, X_tilde = model_x_knockoff(X, y, fdr=fdr, seed=5, verbose=True) + #TODO add tests for the 3 other variables + + fdp, power = cal_fdp_power(ko_result, non_zero) + + assert fdp <= 0.2 + assert power > 0.7 + + From 968500c85d12572c00aa6a257fb73b1d12bbc901 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 16 Dec 2024 17:00:43 +0100 Subject: [PATCH 06/26] Add test for Reid methods --- hidimstat/test/test_noise_std.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/hidimstat/test/test_noise_std.py b/hidimstat/test/test_noise_std.py index cb890529..a5cd94b7 100644 --- a/hidimstat/test/test_noise_std.py +++ b/hidimstat/test/test_noise_std.py @@ -3,6 +3,7 @@ """ import numpy as np +import pytest from numpy.testing import assert_almost_equal from scipy.linalg import toeplitz @@ -129,6 +130,28 @@ def test_group_reid(): assert_almost_equal(np.log(np.min(error_ratio)), 0.0, decimal=1) +def test_group_reid_exception(): + """ + Test the exception of group reid + """ + n_samples = 30 + n_features = 50 + n_times = 10 + X, Y, beta, noise = multivariate_temporal_simulation( + n_samples=n_samples, + n_features=n_features, + n_times=n_times, + ) + # max_iter=1 to get a better coverage + with pytest.raises(ValueError, match= "The requested AR order is to high with " + + "respect to the number of time steps." + ): + group_reid(X, Y, stationary=True, method="AR", order = 100) + + with pytest.raises(ValueError, match= "Unknown method for estimating the covariance matrix"): + group_reid(X, Y, method="test", order = 100) + + def test_empirical_snr(): """Computing empirical signal to noise ratio from the target `y`, the data `X` and the true parameter vector `beta` in a simple From 05628cf17c2cb7a97e28a792662afe42214d444a Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 16 Dec 2024 17:04:16 +0100 Subject: [PATCH 07/26] Add test for permutation_test --- hidimstat/test/test_permutation_test.py | 30 +++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/hidimstat/test/test_permutation_test.py b/hidimstat/test/test_permutation_test.py index 6e1835db..c50162dc 100644 --- a/hidimstat/test/test_permutation_test.py +++ b/hidimstat/test/test_permutation_test.py @@ -38,3 +38,33 @@ def test_permutation_test(): expected[:support_size] = 0.0 assert_almost_equal(pval_corr, expected, decimal=1) + +def test_permutation_test_with_SVR(): + """Testing the procedure on a simulation with no structure and a support + of size 1. 
Computing one-sided p-values, we want a low p-value + for the first feature and p-values close to 0.5 for the others.""" + + n_samples, n_features = 20, 50 + support_size = 1 + sigma = 0.1 + rho = 0.0 + + X_init, y, beta, noise = multivariate_1D_simulation( + n_samples=n_samples, + n_features=n_features, + support_size=support_size, + sigma=sigma, + rho=rho, + shuffle=False, + seed=3, + ) + + y = y - np.mean(y) + X_init = X_init - np.mean(X_init, axis=0) + + pval_corr, one_minus_pval_corr = permutation_test_cv(X_init, y, n_permutations=100, C=0.1) + + expected = 0.5 * np.ones(n_features) + expected[:support_size] = 0.0 + + assert_almost_equal(pval_corr, expected, decimal=1) \ No newline at end of file From 744f2011d544d60751273e4712204c1cbd10c6a1 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 16 Dec 2024 17:28:39 +0100 Subject: [PATCH 08/26] Add tests and fixe name of one variable --- hidimstat/scenario.py | 10 +++--- hidimstat/test/test_clustered_inference.py | 2 +- hidimstat/test/test_scenario.py | 38 ++++++++++++++++++++++ 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/hidimstat/scenario.py b/hidimstat/scenario.py index 113c875a..de0de0fe 100644 --- a/hidimstat/scenario.py +++ b/hidimstat/scenario.py @@ -17,7 +17,7 @@ def multivariate_1D_simulation( shuffle=True, seed=0, nb_group=0, - size_group=0, + group_size=0, ): """Generate 1D data with Toeplitz design matrix @@ -47,7 +47,7 @@ def multivariate_1D_simulation( nb_group : int Number of groups. - size_group : int + group_size : int Size of each group. Returns @@ -67,11 +67,11 @@ def multivariate_1D_simulation( """ rng = np.random.default_rng(seed) - if nb_group < 0 or size_group < 0: + if nb_group < 0 or group_size < 0: raise ValueError( "The number of groups and their size must be positive." 
) - n_generate_samples = n_samples - nb_group * size_group + n_generate_samples = n_samples - nb_group * group_size if n_generate_samples <= 0: raise ValueError( "The number of samples is too small compate to the number " @@ -88,7 +88,7 @@ def multivariate_1D_simulation( rng.shuffle(X.T) if nb_group > 0: - groups = np.repeat(X[:nb_group], size_group, axis=0) + groups = np.repeat(X[:nb_group], group_size, axis=0) X = np.vstack((groups, X)) beta = np.zeros(n_features) diff --git a/hidimstat/test/test_clustered_inference.py b/hidimstat/test/test_clustered_inference.py index 13fdcd42..cdecbacf 100644 --- a/hidimstat/test/test_clustered_inference.py +++ b/hidimstat/test/test_clustered_inference.py @@ -148,7 +148,7 @@ def test_clustered_inference_no_temporal_groups(): shuffle=False, seed=2, nb_group=nb_groups, - size_group=size, + group_size=size, ) groups = np.concatenate([[i]*size for i in range(nb_groups+1)]) diff --git a/hidimstat/test/test_scenario.py b/hidimstat/test/test_scenario.py index 503f6848..4441ffc3 100644 --- a/hidimstat/test/test_scenario.py +++ b/hidimstat/test/test_scenario.py @@ -3,6 +3,7 @@ """ import numpy as np +import pytest from numpy.testing import assert_almost_equal, assert_equal from hidimstat.scenario import ( @@ -55,6 +56,41 @@ def test_multivariate_1D_simulation(): X, y, beta, noise = multivariate_1D_simulation() rho_hat = np.corrcoef(X[:, 19], X[:, 20])[0, 1] assert_almost_equal(rho_hat, 0, decimal=1) + + # Test 3 + X, y, beta, noise = multivariate_1D_simulation(n_samples=9, nb_group=2, group_size=3) + corr_X = np.corrcoef(X) + assert_almost_equal(corr_X[0, 0], 1, decimal=1) + assert_almost_equal(corr_X[0, 1], 1, decimal=1) + assert_almost_equal(corr_X[0, 2], 1, decimal=1) + assert_almost_equal(corr_X[1, 0], 1, decimal=1) + assert_almost_equal(corr_X[1, 1], 1, decimal=1) + assert_almost_equal(corr_X[1, 2], 1, decimal=1) + assert_almost_equal(corr_X[2, 0], 1, decimal=1) + assert_almost_equal(corr_X[2, 1], 1, decimal=1) + assert_almost_equal(corr_X[2, 2], 1, decimal=1) + assert_almost_equal(corr_X[3, 3], 1, decimal=1) + assert_almost_equal(corr_X[3, 4], 1, decimal=1) + assert_almost_equal(corr_X[3, 5], 1, decimal=1) + assert_almost_equal(corr_X[4, 3], 1, decimal=1) + assert_almost_equal(corr_X[4, 4], 1, decimal=1) + assert_almost_equal(corr_X[4, 5], 1, decimal=1) + assert_almost_equal(corr_X[5, 3], 1, decimal=1) + assert_almost_equal(corr_X[5, 4], 1, decimal=1) + assert_almost_equal(corr_X[5, 5], 1, decimal=1) + + + +def test_multivariate_1D_simulation_exception(): + """ + Test when the input paramters is not correct. 
+ """ + with pytest.raises(ValueError, match="The number of groups and their size must be positive."): + multivariate_1D_simulation(nb_group=-1) + + with pytest.raises(ValueError, match="The number of samples is too small compate to the number " + "of group and their size to gerate the data."): + multivariate_1D_simulation(n_samples=10, nb_group=2, group_size=6) def test_multivariate_simulation(): @@ -162,3 +198,5 @@ def test_multivariate_temporal_simulation(): rho_data_hat = np.corrcoef(X[:, 19], X[:, 20])[0, 1] assert_almost_equal(rho_data_hat, rho_data, decimal=1) assert_equal(Y, np.dot(X, beta) + noise) + + From f3473c7b065187e103adb56a7fca08009f1dbbc1 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 16 Dec 2024 18:11:30 +0100 Subject: [PATCH 09/26] Add test and change name of variables --- hidimstat/test/test_utils.py | 50 ++++++++++++++++++++++++++---------- hidimstat/utils.py | 14 +++++----- 2 files changed, 43 insertions(+), 21 deletions(-) diff --git a/hidimstat/test/test_utils.py b/hidimstat/test/test_utils.py index cc1c1190..24bbf74e 100644 --- a/hidimstat/test/test_utils.py +++ b/hidimstat/test/test_utils.py @@ -6,6 +6,33 @@ seed = 42 +def test_quantile_aggregation(): + """ + This function tests the application of the quantile aggregation method + """ + col = np.arange(11) + p_values = np.tile(col, (10, 1)).T / 100 + gamma_min = 0.05 + + assert_array_almost_equal(0.1 * quantile_aggregation(p_values, 0.1), [0.01] * 10) + assert_array_almost_equal(0.1 * quantile_aggregation(p_values, 0.1, adaptive=True, gamma_min=gamma_min) + / (1 - np.log(gamma_min)), [0.01] * 10) + assert_array_almost_equal(0.3 * quantile_aggregation(p_values, 0.3), [0.03] * 10) + assert_array_almost_equal(0.3 * quantile_aggregation(p_values, 0.3, adaptive=True, gamma_min=gamma_min) + / (1 - np.log(gamma_min)), [0.03] * 10) + assert_array_almost_equal(0.5 * quantile_aggregation(p_values, 0.5), [0.05] * 10) + assert_array_almost_equal(0.5 * quantile_aggregation(p_values, 0.5, adaptive=True, gamma_min=gamma_min) + / (1 - np.log(gamma_min)), [0.05] * 10) + + # One p-value within the quantile aggregation method + p_values = np.array([0.0]) + + assert quantile_aggregation(p_values) == 0.0 + assert quantile_aggregation(p_values, adaptive=True) == 0.0 + + + + def test_fdr_threshold(): """ This function tests the application of the False Discovery Rate (FDR) @@ -50,19 +77,14 @@ def test_cal_fdp_power(): assert fdp == 2 / len(selected) assert power == 18 / len(non_zero_index) + # test for zero selection + fdp, power = cal_fdp_power(np.array([]), non_zero_index) -def test_quantile_aggregation(): - """ - This function tests the application of the quantile aggregation method - """ - col = np.arange(11) - p_values = np.tile(col, (10, 1)).T / 100 - - assert_array_almost_equal(0.1 * quantile_aggregation(p_values, 0.1), [0.01] * 10) - assert_array_almost_equal(0.3 * quantile_aggregation(p_values, 0.3), [0.03] * 10) - assert_array_almost_equal(0.5 * quantile_aggregation(p_values, 0.5), [0.05] * 10) - - # One p-value within the quantile aggregation method - p_values = np.array([0.0]) + assert fdp == 0 + assert power == 0 - assert quantile_aggregation(p_values) == 0.0 + # test for shift in index + fdp, power = cal_fdp_power(selected+1, non_zero_index, r_index=True) + + assert fdp == 2 / len(selected) + assert power == 18 / len(non_zero_index) \ No newline at end of file diff --git a/hidimstat/utils.py b/hidimstat/utils.py index efdc9336..418ee85a 100644 --- a/hidimstat/utils.py +++ b/hidimstat/utils.py @@ -84,19 +84,19 @@ 
def _bhq_threshold(pvals, fdr=0.1): return -1.0 -def _ebh_threshold(evals, fdr=0.1): +def _ebh_threshold(pvals, fdr=0.1): """e-BH procedure for FDR control (see Wang and Ramdas 2021)""" - n_features = len(evals) - evals_sorted = -np.sort(-evals) # sort in descending order + n_features = len(pvals) + pvals_sorted = -np.sort(-pvals) # sort in descending order selected_index = 2 * n_features for i in range(n_features - 1, -1, -1): - if evals_sorted[i] >= n_features / (fdr * (i + 1)): + if pvals_sorted[i] >= n_features / (fdr * (i + 1)): selected_index = i break if selected_index <= n_features: - return evals_sorted[selected_index] + return pvals_sorted[selected_index] else: - return np.infty + return np.inf def _bhy_threshold(pvals, reshaping_function=None, fdr=0.1): @@ -147,7 +147,7 @@ def _fixed_quantile_aggregation(pvals, gamma=0.5): def _adaptive_quantile_aggregation(pvals, gamma_min=0.05): """adaptive version of the quantile aggregation method, Meinshausen et al. (2008)""" - gammas = np.arange(gamma_min, 1.05, 0.05) + gammas = np.linspace(gamma_min, 1., 30) list_Q = np.array([_fixed_quantile_aggregation(pvals, gamma) for gamma in gammas]) return np.minimum(1, (1 - np.log(gamma_min)) * list_Q.min(0)) From 9b872a83a994639dc56f884fe2088f413d61d9da Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 16 Dec 2024 18:15:33 +0100 Subject: [PATCH 10/26] Unecessary file --- hidimstat/setup.py | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 hidimstat/setup.py diff --git a/hidimstat/setup.py b/hidimstat/setup.py deleted file mode 100644 index 15c6389d..00000000 --- a/hidimstat/setup.py +++ /dev/null @@ -1,14 +0,0 @@ -def configuration(parent_package="", top_path=None): - from numpy.distutils.misc_util import Configuration - - config = Configuration("plotting", parent_package, top_path) - - config.add_subpackage("tests") - - return config - - -if __name__ == "__main__": - from numpy.distutils.core import setup - - setup(**configuration(top_path="").todict()) From 4683e5880198f93d6b814c002cc158d3099a0fbb Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 18 Dec 2024 12:49:06 +0100 Subject: [PATCH 11/26] Fix typo --- hidimstat/scenario.py | 10 +++--- hidimstat/test/test_clustered_inference.py | 22 ++++++------- hidimstat/test/test_dcrt.py | 28 ++++++++-------- hidimstat/test/test_desparsified_lasso.py | 9 ++--- .../test/test_ensemble_clustered_inference.py | 33 +++++++------------ hidimstat/test/test_generate_knockoff.py | 11 ++++--- hidimstat/test/test_knockoff_aggregation.py | 8 ++--- hidimstat/test/test_loco.py | 1 - hidimstat/test/test_model_x_knockoff.py | 8 ++--- hidimstat/test/test_noise_std.py | 18 ++++++---- hidimstat/test/test_permutation_test.py | 7 ++-- hidimstat/test/test_scenario.py | 24 ++++++++------ hidimstat/test/test_utils.py | 32 +++++++++++------- hidimstat/utils.py | 2 +- 14 files changed, 113 insertions(+), 100 deletions(-) diff --git a/hidimstat/scenario.py b/hidimstat/scenario.py index de0de0fe..88130c9b 100644 --- a/hidimstat/scenario.py +++ b/hidimstat/scenario.py @@ -43,10 +43,10 @@ def multivariate_1D_simulation( seed : int Seed used for generating design matrix and noise. - + nb_group : int Number of groups. - + group_size : int Size of each group. @@ -68,10 +68,8 @@ def multivariate_1D_simulation( rng = np.random.default_rng(seed) if nb_group < 0 or group_size < 0: - raise ValueError( - "The number of groups and their size must be positive." 
- ) - n_generate_samples = n_samples - nb_group * group_size + raise ValueError("The number of groups and their size must be positive.") + n_generate_samples = n_samples - nb_group * group_size if n_generate_samples <= 0: raise ValueError( "The number of samples is too small compate to the number " diff --git a/hidimstat/test/test_clustered_inference.py b/hidimstat/test/test_clustered_inference.py index cdecbacf..490b5365 100644 --- a/hidimstat/test/test_clustered_inference.py +++ b/hidimstat/test/test_clustered_inference.py @@ -1,6 +1,7 @@ """ Test the clustered_inference module """ + import pytest import numpy as np from numpy.testing import assert_almost_equal @@ -18,8 +19,8 @@ def test_clustered_inference_no_temporal(): """ Testing the procedure on one simulations with a 1D data structure and - with n << p: no temporal dimension. The support is connected and of - size 10, it must be recovered with a small spatial tolerance + with n << p: no temporal dimension. The support is connected and of + size 10, it must be recovered with a small spatial tolerance parametrized by `margin_size`. Computing one sided p-values, we want low p-values for the features of the support and p-values close to 0.5 for the others. @@ -69,8 +70,8 @@ def test_clustered_inference_no_temporal(): def test_clustered_inference_temporal(): """ Testing the procedure on two simulations with a 1D data structure and - with n << p: with a temporal dimension. The support is connected and - of size 10, it must be recovered with a small spatial tolerance + with n << p: with a temporal dimension. The support is connected and + of size 10, it must be recovered with a small spatial tolerance parametrized by `margin_size`. Computing one sided p-values, we want low p-values for the features of the support and p-values close to 0.5 for the others. @@ -120,10 +121,10 @@ def test_clustered_inference_temporal(): def test_clustered_inference_no_temporal_groups(): """ Testing the procedure on one simulations with a 1D data structure and - with n << p: no temporal dimension. The support is connected and of - size 10, it must be recovered with a small spatial tolerance - parametrized by `margin_size`. - We group the sample in 10 groups of size 10. + with n << p: no temporal dimension. The support is connected and of + size 10, it must be recovered with a small spatial tolerance + parametrized by `margin_size`. + We group the sample in 10 groups of size 10. Computing one sided p-values, we want low p-values for the features of the support and p-values close to 0.5 for the others. 
""" @@ -150,7 +151,7 @@ def test_clustered_inference_no_temporal_groups(): nb_group=nb_groups, group_size=size, ) - groups = np.concatenate([[i]*size for i in range(nb_groups+1)]) + groups = np.concatenate([[i] * size for i in range(nb_groups + 1)]) y = y - np.mean(y) X_init = X_init - np.mean(X_init, axis=0) @@ -171,7 +172,6 @@ def test_clustered_inference_no_temporal_groups(): assert_almost_equal( pval_corr[extended_support:200], expected[extended_support:200], decimal=1 ) - # Scenario 1: data with no temporal dimension @@ -200,4 +200,4 @@ def test_clustered_inference_exception_methods(): ) with pytest.raises(ValueError, match="Unknow method"): - clustered_inference(X_init, y, ward, n_clusters, method="lll") \ No newline at end of file + clustered_inference(X_init, y, ward, n_clusters, method="lll") diff --git a/hidimstat/test/test_dcrt.py b/hidimstat/test/test_dcrt.py index ac2b2061..da5af8fd 100644 --- a/hidimstat/test/test_dcrt.py +++ b/hidimstat/test/test_dcrt.py @@ -36,26 +36,26 @@ def test_dcrt_lasso(): statistic="test", random_state=2024, ) - + # Checking for bad selection of screening_threshold result_th_screen_bad = dcrt_zero( - X, - y, - screening_threshold = 0, - screening=True, - verbose=False, - ) + X, + y, + screening_threshold=0, + screening=True, + verbose=False, + ) assert result_th_screen_bad.size == 0 # Checking for bad selection of screening_threshold with verbose result_th_screen_bad = dcrt_zero( - X, - y, - screening_threshold = 0, - screening=True, - verbose=True, + X, + y, + screening_threshold=0, + screening=True, + verbose=True, ) - + assert len(result_th_screen_bad) == 3 assert result_th_screen_bad[0].size == 0 assert np.all(result_th_screen_bad[1] == np.ones(10)) @@ -95,7 +95,7 @@ def test_dcrt_lasso(): y, refit=True, screening=True, - screening_threshold = 50, + screening_threshold=50, verbose=True, statistic="residual", random_state=2024, diff --git a/hidimstat/test/test_desparsified_lasso.py b/hidimstat/test/test_desparsified_lasso.py index 007c26cc..69e26dbe 100644 --- a/hidimstat/test/test_desparsified_lasso.py +++ b/hidimstat/test/test_desparsified_lasso.py @@ -52,10 +52,11 @@ def test_desparsified_lasso(): def test_desparsified_lasso_exception(): """Testing exception of not using lasso""" - X, y, beta, noise = multivariate_1D_simulation( - ) - with pytest.raises(ValueError, match="The only regression method available is 'lasso'"): - _ = desparsified_lasso(X, y, residual_method='test') + X, y, beta, noise = multivariate_1D_simulation() + with pytest.raises( + ValueError, match="The only regression method available is 'lasso'" + ): + _ = desparsified_lasso(X, y, residual_method="test") def test_desparsified_group_lasso(): diff --git a/hidimstat/test/test_ensemble_clustered_inference.py b/hidimstat/test/test_ensemble_clustered_inference.py index 39381108..0d59e221 100644 --- a/hidimstat/test/test_ensemble_clustered_inference.py +++ b/hidimstat/test/test_ensemble_clustered_inference.py @@ -135,7 +135,7 @@ def test_ensemble_clustered_inference(): n_clusters, n_bootstraps=n_bootstraps, inference_method=inference_method, - ensembling_method="medians" + ensembling_method="medians", ) ) @@ -150,8 +150,6 @@ def test_ensemble_clustered_inference(): ) - - def test_ensemble_clustered_inference_exception(): """ Test the raise of exception @@ -159,30 +157,23 @@ def test_ensemble_clustered_inference_exception(): n_samples, n_features = 100, 2000 n_clusters = 10 X, Y, beta, epsilon = multivariate_1D_simulation( - n_samples=n_samples, n_features=n_features, + 
n_samples=n_samples, + n_features=n_features, ) connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1) ward = FeatureAgglomeration( n_clusters=n_clusters, connectivity=connectivity, linkage="ward" ) - + # Test the raise of exception with pytest.raises(ValueError, match="Unknown ensembling method."): ensemble_clustered_inference( - X, - Y, - ward, - n_clusters, - ensembling_method="wrong_method" + X, Y, ward, n_clusters, ensembling_method="wrong_method" ) - - - with pytest.raises(ValueError, match="'memory' must be None or a string corresponding " - + "to the path of the caching directory."): - ensemble_clustered_inference( - X, - Y, - ward, - n_clusters, - memory=[] - ) \ No newline at end of file + + with pytest.raises( + ValueError, + match="'memory' must be None or a string corresponding " + + "to the path of the caching directory.", + ): + ensemble_clustered_inference(X, Y, ward, n_clusters, memory=[]) diff --git a/hidimstat/test/test_generate_knockoff.py b/hidimstat/test/test_generate_knockoff.py index 44430ec7..ef858f67 100644 --- a/hidimstat/test/test_generate_knockoff.py +++ b/hidimstat/test/test_generate_knockoff.py @@ -9,7 +9,6 @@ import pytest - def test_estimate_distribution_wolf(): SEED = 42 n = 100 @@ -31,12 +30,15 @@ def test_estimate_distribution_lasso(): assert mu.size == p assert Sigma.shape == (p, p) + def test_gaussian_knockoff_estimate_exception(): SEED = 42 n = 100 p = 50 X, y, _, non_zero = simu_data(n, p, seed=SEED) - with pytest.raises(ValueError, match="test is not a valid covariance estimated method"): + with pytest.raises( + ValueError, match="test is not a valid covariance estimated method" + ): _estimate_distribution(X, shrink=True, cov_estimator="test") @@ -59,6 +61,7 @@ def test_gaussian_knockoff_exception(): X, y, _, non_zero = simu_data(n, p, seed=SEED) mu, Sigma = _estimate_distribution(X, shrink=True, cov_estimator="ledoit_wolf") - with pytest.raises(ValueError, match="test is not a valid knockoff contriction method"): + with pytest.raises( + ValueError, match="test is not a valid knockoff contriction method" + ): gaussian_knockoff_generation(X, mu, Sigma, method="test", seed=SEED * 2) - diff --git a/hidimstat/test/test_knockoff_aggregation.py b/hidimstat/test/test_knockoff_aggregation.py index 992ef6dd..67d5628c 100644 --- a/hidimstat/test/test_knockoff_aggregation.py +++ b/hidimstat/test/test_knockoff_aggregation.py @@ -54,8 +54,10 @@ def test_knockoff_aggregation(): assert fdp_verbose < 0.5 assert power_verbose > 0.1 - selected_ko, test_scored, thres, X_tilde = model_x_knockoff(X, y, fdr=fdr, seed=5, verbose=True) - #TODO add tests for the 3 other variables + selected_ko, test_scored, thres, X_tilde = model_x_knockoff( + X, y, fdr=fdr, seed=5, verbose=True + ) + # TODO add tests for the 3 other variables np.testing.assert_array_equal(selected_no_verbose, selected_ko) @@ -67,8 +69,6 @@ def test_knockoff_aggregation(): assert fdp_verbose < 0.5 assert power_verbose > 0.1 - - # Using e-values aggregation (verbose vs no verbose) selected_verbose, aggregated_pval, pvals = knockoff_aggregation( diff --git a/hidimstat/test/test_loco.py b/hidimstat/test/test_loco.py index 67927513..a6eeaf00 100644 --- a/hidimstat/test/test_loco.py +++ b/hidimstat/test/test_loco.py @@ -1,7 +1,6 @@ import numpy as np import pandas as pd import pytest -from sklearn.base import clone from sklearn.exceptions import NotFittedError from sklearn.linear_model import LinearRegression, LogisticRegression from sklearn.metrics import log_loss diff --git 
a/hidimstat/test/test_model_x_knockoff.py b/hidimstat/test/test_model_x_knockoff.py index 6caa3902..fe56f887 100644 --- a/hidimstat/test/test_model_x_knockoff.py +++ b/hidimstat/test/test_model_x_knockoff.py @@ -22,12 +22,12 @@ def test_model_x_knockoff_with_verbose(): n = 300 p = 300 X, y, _, non_zero = simu_data(n, p, seed=seed) - ko_result, test_scored, thres, X_tilde = model_x_knockoff(X, y, fdr=fdr, seed=5, verbose=True) - #TODO add tests for the 3 other variables + ko_result, test_scored, thres, X_tilde = model_x_knockoff( + X, y, fdr=fdr, seed=5, verbose=True + ) + # TODO add tests for the 3 other variables fdp, power = cal_fdp_power(ko_result, non_zero) assert fdp <= 0.2 assert power > 0.7 - - diff --git a/hidimstat/test/test_noise_std.py b/hidimstat/test/test_noise_std.py index a5cd94b7..6766adea 100644 --- a/hidimstat/test/test_noise_std.py +++ b/hidimstat/test/test_noise_std.py @@ -143,13 +143,17 @@ def test_group_reid_exception(): n_times=n_times, ) # max_iter=1 to get a better coverage - with pytest.raises(ValueError, match= "The requested AR order is to high with " - + "respect to the number of time steps." - ): - group_reid(X, Y, stationary=True, method="AR", order = 100) - - with pytest.raises(ValueError, match= "Unknown method for estimating the covariance matrix"): - group_reid(X, Y, method="test", order = 100) + with pytest.raises( + ValueError, + match="The requested AR order is to high with " + + "respect to the number of time steps.", + ): + group_reid(X, Y, stationary=True, method="AR", order=100) + + with pytest.raises( + ValueError, match="Unknown method for estimating the covariance matrix" + ): + group_reid(X, Y, method="test", order=100) def test_empirical_snr(): diff --git a/hidimstat/test/test_permutation_test.py b/hidimstat/test/test_permutation_test.py index c50162dc..dc94fecf 100644 --- a/hidimstat/test/test_permutation_test.py +++ b/hidimstat/test/test_permutation_test.py @@ -39,6 +39,7 @@ def test_permutation_test(): assert_almost_equal(pval_corr, expected, decimal=1) + def test_permutation_test_with_SVR(): """Testing the procedure on a simulation with no structure and a support of size 1. 
Computing one-sided p-values, we want a low p-value @@ -62,9 +63,11 @@ def test_permutation_test_with_SVR(): y = y - np.mean(y) X_init = X_init - np.mean(X_init, axis=0) - pval_corr, one_minus_pval_corr = permutation_test_cv(X_init, y, n_permutations=100, C=0.1) + pval_corr, one_minus_pval_corr = permutation_test_cv( + X_init, y, n_permutations=100, C=0.1 + ) expected = 0.5 * np.ones(n_features) expected[:support_size] = 0.0 - assert_almost_equal(pval_corr, expected, decimal=1) \ No newline at end of file + assert_almost_equal(pval_corr, expected, decimal=1) diff --git a/hidimstat/test/test_scenario.py b/hidimstat/test/test_scenario.py index 4441ffc3..e6d706a6 100644 --- a/hidimstat/test/test_scenario.py +++ b/hidimstat/test/test_scenario.py @@ -56,9 +56,11 @@ def test_multivariate_1D_simulation(): X, y, beta, noise = multivariate_1D_simulation() rho_hat = np.corrcoef(X[:, 19], X[:, 20])[0, 1] assert_almost_equal(rho_hat, 0, decimal=1) - + # Test 3 - X, y, beta, noise = multivariate_1D_simulation(n_samples=9, nb_group=2, group_size=3) + X, y, beta, noise = multivariate_1D_simulation( + n_samples=9, nb_group=2, group_size=3 + ) corr_X = np.corrcoef(X) assert_almost_equal(corr_X[0, 0], 1, decimal=1) assert_almost_equal(corr_X[0, 1], 1, decimal=1) @@ -78,18 +80,22 @@ def test_multivariate_1D_simulation(): assert_almost_equal(corr_X[5, 3], 1, decimal=1) assert_almost_equal(corr_X[5, 4], 1, decimal=1) assert_almost_equal(corr_X[5, 5], 1, decimal=1) - - + def test_multivariate_1D_simulation_exception(): """ Test when the input paramters is not correct. """ - with pytest.raises(ValueError, match="The number of groups and their size must be positive."): + with pytest.raises( + ValueError, match="The number of groups and their size must be positive." + ): multivariate_1D_simulation(nb_group=-1) - - with pytest.raises(ValueError, match="The number of samples is too small compate to the number " - "of group and their size to gerate the data."): + + with pytest.raises( + ValueError, + match="The number of samples is too small compate to the number " + "of group and their size to gerate the data.", + ): multivariate_1D_simulation(n_samples=10, nb_group=2, group_size=6) @@ -198,5 +204,3 @@ def test_multivariate_temporal_simulation(): rho_data_hat = np.corrcoef(X[:, 19], X[:, 20])[0, 1] assert_almost_equal(rho_data_hat, rho_data, decimal=1) assert_equal(Y, np.dot(X, beta) + noise) - - diff --git a/hidimstat/test/test_utils.py b/hidimstat/test/test_utils.py index 24bbf74e..f6efe8b0 100644 --- a/hidimstat/test/test_utils.py +++ b/hidimstat/test/test_utils.py @@ -15,22 +15,32 @@ def test_quantile_aggregation(): gamma_min = 0.05 assert_array_almost_equal(0.1 * quantile_aggregation(p_values, 0.1), [0.01] * 10) - assert_array_almost_equal(0.1 * quantile_aggregation(p_values, 0.1, adaptive=True, gamma_min=gamma_min) - / (1 - np.log(gamma_min)), [0.01] * 10) + assert_array_almost_equal( + 0.1 + * quantile_aggregation(p_values, 0.1, adaptive=True, gamma_min=gamma_min) + / (1 - np.log(gamma_min)), + [0.01] * 10, + ) assert_array_almost_equal(0.3 * quantile_aggregation(p_values, 0.3), [0.03] * 10) - assert_array_almost_equal(0.3 * quantile_aggregation(p_values, 0.3, adaptive=True, gamma_min=gamma_min) - / (1 - np.log(gamma_min)), [0.03] * 10) + assert_array_almost_equal( + 0.3 + * quantile_aggregation(p_values, 0.3, adaptive=True, gamma_min=gamma_min) + / (1 - np.log(gamma_min)), + [0.03] * 10, + ) assert_array_almost_equal(0.5 * quantile_aggregation(p_values, 0.5), [0.05] * 10) - assert_array_almost_equal(0.5 * 
quantile_aggregation(p_values, 0.5, adaptive=True, gamma_min=gamma_min) - / (1 - np.log(gamma_min)), [0.05] * 10) + assert_array_almost_equal( + 0.5 + * quantile_aggregation(p_values, 0.5, adaptive=True, gamma_min=gamma_min) + / (1 - np.log(gamma_min)), + [0.05] * 10, + ) # One p-value within the quantile aggregation method p_values = np.array([0.0]) assert quantile_aggregation(p_values) == 0.0 assert quantile_aggregation(p_values, adaptive=True) == 0.0 - - def test_fdr_threshold(): @@ -84,7 +94,7 @@ def test_cal_fdp_power(): assert power == 0 # test for shift in index - fdp, power = cal_fdp_power(selected+1, non_zero_index, r_index=True) - + fdp, power = cal_fdp_power(selected + 1, non_zero_index, r_index=True) + assert fdp == 2 / len(selected) - assert power == 18 / len(non_zero_index) \ No newline at end of file + assert power == 18 / len(non_zero_index) diff --git a/hidimstat/utils.py b/hidimstat/utils.py index 418ee85a..0c1abc44 100644 --- a/hidimstat/utils.py +++ b/hidimstat/utils.py @@ -147,7 +147,7 @@ def _fixed_quantile_aggregation(pvals, gamma=0.5): def _adaptive_quantile_aggregation(pvals, gamma_min=0.05): """adaptive version of the quantile aggregation method, Meinshausen et al. (2008)""" - gammas = np.linspace(gamma_min, 1., 30) + gammas = np.linspace(gamma_min, 1.0, 30) list_Q = np.array([_fixed_quantile_aggregation(pvals, gamma) for gamma in gammas]) return np.minimum(1, (1 - np.log(gamma_min)) * list_Q.min(0)) From a830f047cdd45a21f7a3858b123845ebea9a3736 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 18 Dec 2024 15:35:25 +0100 Subject: [PATCH 12/26] Fix syntax --- hidimstat/scenario.py | 12 ++++---- hidimstat/test/test_generate_knockoff.py | 36 ++++++++++++------------ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/hidimstat/scenario.py b/hidimstat/scenario.py index 88130c9b..0286b560 100644 --- a/hidimstat/scenario.py +++ b/hidimstat/scenario.py @@ -16,7 +16,7 @@ def multivariate_1D_simulation( rho=0.0, shuffle=True, seed=0, - nb_group=0, + n_groups=0, group_size=0, ): """Generate 1D data with Toeplitz design matrix @@ -44,7 +44,7 @@ def multivariate_1D_simulation( seed : int Seed used for generating design matrix and noise. - nb_group : int + n_groups : int Number of groups. 
group_size : int @@ -67,9 +67,9 @@ def multivariate_1D_simulation( """ rng = np.random.default_rng(seed) - if nb_group < 0 or group_size < 0: + if n_groups < 0 or group_size < 0: raise ValueError("The number of groups and their size must be positive.") - n_generate_samples = n_samples - nb_group * group_size + n_generate_samples = n_samples - n_groups * group_size if n_generate_samples <= 0: raise ValueError( "The number of samples is too small compate to the number " @@ -85,8 +85,8 @@ def multivariate_1D_simulation( if shuffle: rng.shuffle(X.T) - if nb_group > 0: - groups = np.repeat(X[:nb_group], group_size, axis=0) + if n_groups > 0: + groups = np.repeat(X[:n_groups], group_size, axis=0) X = np.vstack((groups, X)) beta = np.zeros(n_features) diff --git a/hidimstat/test/test_generate_knockoff.py b/hidimstat/test/test_generate_knockoff.py index ef858f67..231e8486 100644 --- a/hidimstat/test/test_generate_knockoff.py +++ b/hidimstat/test/test_generate_knockoff.py @@ -10,32 +10,32 @@ def test_estimate_distribution_wolf(): - SEED = 42 + seed = 42 n = 100 p = 50 - X, y, _, non_zero = simu_data(n, p, seed=SEED) - mu, Sigma = _estimate_distribution(X, cov_estimator="ledoit_wolf") + X, _, _, _ = simu_data(n, p, seed=seed) + mu, sigma = _estimate_distribution(X, cov_estimator="ledoit_wolf") assert mu.size == p - assert Sigma.shape == (p, p) + assert sigma.shape == (p, p) def test_estimate_distribution_lasso(): - SEED = 42 + seed = 42 n = 100 p = 50 - X, y, _, non_zero = simu_data(n, p, seed=SEED) - mu, Sigma = _estimate_distribution(X, shrink=True, cov_estimator="graph_lasso") + X, _, _, _ = simu_data(n, p, seed=seed) + mu, sigma = _estimate_distribution(X, shrink=True, cov_estimator="graph_lasso") assert mu.size == p - assert Sigma.shape == (p, p) + assert sigma.shape == (p, p) def test_gaussian_knockoff_estimate_exception(): - SEED = 42 + seed = 42 n = 100 p = 50 - X, y, _, non_zero = simu_data(n, p, seed=SEED) + X, _, _, _ = simu_data(n, p, seed=seed) with pytest.raises( ValueError, match="test is not a valid covariance estimated method" ): @@ -43,25 +43,25 @@ def test_gaussian_knockoff_estimate_exception(): def test_gaussian_knockoff_equi(): - SEED = 42 + seed = 42 n = 100 p = 50 - X, y, _, non_zero = simu_data(n, p, seed=SEED) - mu, Sigma = _estimate_distribution(X, shrink=True, cov_estimator="ledoit_wolf") + X, _, _, _ = simu_data(n, p, seed=seed) + mu, sigma = _estimate_distribution(X, shrink=True, cov_estimator="ledoit_wolf") - X_tilde = gaussian_knockoff_generation(X, mu, Sigma, method="equi", seed=SEED * 2) + X_tilde = gaussian_knockoff_generation(X, mu, sigma, method="equi", seed=seed * 2) assert X_tilde.shape == (n, p) def test_gaussian_knockoff_exception(): - SEED = 42 + seed = 42 n = 100 p = 50 - X, y, _, non_zero = simu_data(n, p, seed=SEED) - mu, Sigma = _estimate_distribution(X, shrink=True, cov_estimator="ledoit_wolf") + X, _, _, _ = simu_data(n, p, seed=seed) + mu, sigma = _estimate_distribution(X, shrink=True, cov_estimator="ledoit_wolf") with pytest.raises( ValueError, match="test is not a valid knockoff contriction method" ): - gaussian_knockoff_generation(X, mu, Sigma, method="test", seed=SEED * 2) + gaussian_knockoff_generation(X, mu, sigma, method="test", seed=seed * 2) From 63e6c540bbac6735f4d3dd75301fa2b63f2d03d4 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 18 Dec 2024 15:43:56 +0100 Subject: [PATCH 13/26] Fix bugs by changing name of groups --- hidimstat/test/test_clustered_inference.py | 6 +++--- hidimstat/test/test_scenario.py | 6 +++--- 2 files changed, 6 
insertions(+), 6 deletions(-) diff --git a/hidimstat/test/test_clustered_inference.py b/hidimstat/test/test_clustered_inference.py index 490b5365..4a726367 100644 --- a/hidimstat/test/test_clustered_inference.py +++ b/hidimstat/test/test_clustered_inference.py @@ -131,7 +131,7 @@ def test_clustered_inference_no_temporal_groups(): n_samples, n_features = 100, 2000 support_size = 10 - nb_groups = 9 + n_groupss = 9 size = 10 sigma = 5.0 rho = 0.95 @@ -148,10 +148,10 @@ def test_clustered_inference_no_temporal_groups(): rho=rho, shuffle=False, seed=2, - nb_group=nb_groups, + n_groups=n_groupss, group_size=size, ) - groups = np.concatenate([[i] * size for i in range(nb_groups + 1)]) + groups = np.concatenate([[i] * size for i in range(n_groupss + 1)]) y = y - np.mean(y) X_init = X_init - np.mean(X_init, axis=0) diff --git a/hidimstat/test/test_scenario.py b/hidimstat/test/test_scenario.py index e6d706a6..c17794ad 100644 --- a/hidimstat/test/test_scenario.py +++ b/hidimstat/test/test_scenario.py @@ -59,7 +59,7 @@ def test_multivariate_1D_simulation(): # Test 3 X, y, beta, noise = multivariate_1D_simulation( - n_samples=9, nb_group=2, group_size=3 + n_samples=9, n_groups=2, group_size=3 ) corr_X = np.corrcoef(X) assert_almost_equal(corr_X[0, 0], 1, decimal=1) @@ -89,14 +89,14 @@ def test_multivariate_1D_simulation_exception(): with pytest.raises( ValueError, match="The number of groups and their size must be positive." ): - multivariate_1D_simulation(nb_group=-1) + multivariate_1D_simulation(n_groups=-1) with pytest.raises( ValueError, match="The number of samples is too small compate to the number " "of group and their size to gerate the data.", ): - multivariate_1D_simulation(n_samples=10, nb_group=2, group_size=6) + multivariate_1D_simulation(n_samples=10, n_groups=2, group_size=6) def test_multivariate_simulation(): From 3dfc30ee4066b594b3be7662646204c85816f228 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 19 Dec 2024 10:37:43 +0100 Subject: [PATCH 14/26] Modified scenario with group --- hidimstat/scenario.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/hidimstat/scenario.py b/hidimstat/scenario.py index 0286b560..1defd92d 100644 --- a/hidimstat/scenario.py +++ b/hidimstat/scenario.py @@ -67,17 +67,22 @@ def multivariate_1D_simulation( """ rng = np.random.default_rng(seed) + if n_groups < 0 or group_size < 0: raise ValueError("The number of groups and their size must be positive.") - n_generate_samples = n_samples - n_groups * group_size - if n_generate_samples <= 0: + + n_individual_samples = n_samples - n_groups * group_size + if n_individual_samples <= 0: raise ValueError( "The number of samples is too small compate to the number " "of group and their size to gerate the data." 
) + + n_generate_samples = n_groups + n_individual_samples + + # generate random data for each samples X = np.zeros((n_generate_samples, n_features)) X[:, 0] = rng.standard_normal(n_generate_samples) - for i in np.arange(1, n_features): rand_vector = ((1 - rho**2) ** 0.5) * rng.standard_normal(n_generate_samples) X[:, i] = rho * X[:, i - 1] + rand_vector @@ -85,13 +90,16 @@ def multivariate_1D_simulation( if shuffle: rng.shuffle(X.T) + # generate data for the groups based on one sample if n_groups > 0: groups = np.repeat(X[:n_groups], group_size, axis=0) - X = np.vstack((groups, X)) + X = np.vstack((groups, X[n_groups:])) + # generate the vector of variable of importances beta = np.zeros(n_features) beta[0:support_size] = 1.0 + # generate the simulated regression data noise = sigma * rng.standard_normal(n_samples) y = np.dot(X, beta) + noise From 369ccdb7282cb4851a4c1d5dcfde6f68df39188c Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 19 Dec 2024 10:39:23 +0100 Subject: [PATCH 15/26] Fix typo --- hidimstat/scenario.py | 10 +++++----- hidimstat/test/test_clustered_inference.py | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/hidimstat/scenario.py b/hidimstat/scenario.py index 1defd92d..60f15d77 100644 --- a/hidimstat/scenario.py +++ b/hidimstat/scenario.py @@ -67,19 +67,19 @@ def multivariate_1D_simulation( """ rng = np.random.default_rng(seed) - + if n_groups < 0 or group_size < 0: raise ValueError("The number of groups and their size must be positive.") - + n_individual_samples = n_samples - n_groups * group_size if n_individual_samples <= 0: raise ValueError( "The number of samples is too small compate to the number " "of group and their size to gerate the data." ) - + n_generate_samples = n_groups + n_individual_samples - + # generate random data for each samples X = np.zeros((n_generate_samples, n_features)) X[:, 0] = rng.standard_normal(n_generate_samples) @@ -90,7 +90,7 @@ def multivariate_1D_simulation( if shuffle: rng.shuffle(X.T) - # generate data for the groups based on one sample + # generate data for the groups based on one sample if n_groups > 0: groups = np.repeat(X[:n_groups], group_size, axis=0) X = np.vstack((groups, X[n_groups:])) diff --git a/hidimstat/test/test_clustered_inference.py b/hidimstat/test/test_clustered_inference.py index 4a726367..12149597 100644 --- a/hidimstat/test/test_clustered_inference.py +++ b/hidimstat/test/test_clustered_inference.py @@ -131,7 +131,7 @@ def test_clustered_inference_no_temporal_groups(): n_samples, n_features = 100, 2000 support_size = 10 - n_groupss = 9 + n_groups = 9 size = 10 sigma = 5.0 rho = 0.95 @@ -148,10 +148,10 @@ def test_clustered_inference_no_temporal_groups(): rho=rho, shuffle=False, seed=2, - n_groups=n_groupss, + n_groups=n_groups, group_size=size, ) - groups = np.concatenate([[i] * size for i in range(n_groupss + 1)]) + groups = np.concatenate([[i] * size for i in range(n_groups + 1)]) y = y - np.mean(y) X_init = X_init - np.mean(X_init, axis=0) From 7e03524e781f03a523d72e352bd332d509606292 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 6 Jan 2025 18:10:06 +0100 Subject: [PATCH 16/26] don't use global variable --- hidimstat/test/test_knockoff_aggregation.py | 14 +++++++------- hidimstat/test/test_model_x_knockoff.py | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/hidimstat/test/test_knockoff_aggregation.py b/hidimstat/test/test_knockoff_aggregation.py index 67d5628c..49fa7798 100644 --- a/hidimstat/test/test_knockoff_aggregation.py +++ 
b/hidimstat/test/test_knockoff_aggregation.py @@ -4,15 +4,15 @@ import numpy as np import pytest -n = 500 -p = 100 -snr = 5 -n_bootstraps = 25 -fdr = 0.5 -X, y, _, non_zero_index = simu_data(n, p, snr=snr, seed=0) - def test_knockoff_aggregation(): + n = 500 + p = 100 + snr = 5 + n_bootstraps = 25 + fdr = 0.5 + X, y, _, non_zero_index = simu_data(n, p, snr=snr, seed=0) + selected_verbose, aggregated_pval, pvals = knockoff_aggregation( X, y, fdr=fdr, n_bootstraps=n_bootstraps, verbose=True, random_state=0 ) diff --git a/hidimstat/test/test_model_x_knockoff.py b/hidimstat/test/test_model_x_knockoff.py index fe56f887..c4cb1b7e 100644 --- a/hidimstat/test/test_model_x_knockoff.py +++ b/hidimstat/test/test_model_x_knockoff.py @@ -2,12 +2,10 @@ from hidimstat.data_simulation import simu_data from hidimstat.utils import cal_fdp_power -seed = 42 -fdr = 0.2 - def test_model_x_knockoff(): - + seed = 42 + fdr = 0.2 n = 300 p = 300 X, y, _, non_zero = simu_data(n, p, seed=seed) @@ -19,6 +17,8 @@ def test_model_x_knockoff(): def test_model_x_knockoff_with_verbose(): + seed = 42 + fdr = 0.2 n = 300 p = 300 X, y, _, non_zero = simu_data(n, p, seed=seed) From 794310efd5c7f1c47384ae79175c75a03bcfcca9 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Tue, 7 Jan 2025 15:05:38 +0100 Subject: [PATCH 17/26] Apply suggestions from code review improvement Co-authored-by: bthirion Co-authored-by: Joseph Paillard --- src/hidimstat/scenario.py | 4 ++-- test/test_scenario.py | 23 +++++------------------ 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/src/hidimstat/scenario.py b/src/hidimstat/scenario.py index f147e5ad..c07dfdbf 100644 --- a/src/hidimstat/scenario.py +++ b/src/hidimstat/scenario.py @@ -54,7 +54,7 @@ def multivariate_1D_simulation( ------- X : ndarray, shape (n_samples, n_features) Design matrix. - if their is some groups, the first rows contains the groups values. + if there are some groups, the first rows contains the group index y : ndarray, shape (n_samples,) Target. @@ -74,7 +74,7 @@ def multivariate_1D_simulation( n_individual_samples = n_samples - n_groups * group_size if n_individual_samples <= 0: raise ValueError( - "The number of samples is too small compate to the number " + "The number of samples is too small compared to the number " "of group and their size to gerate the data." 
) diff --git a/test/test_scenario.py b/test/test_scenario.py index c17794ad..08be5e06 100644 --- a/test/test_scenario.py +++ b/test/test_scenario.py @@ -62,24 +62,11 @@ def test_multivariate_1D_simulation(): n_samples=9, n_groups=2, group_size=3 ) corr_X = np.corrcoef(X) - assert_almost_equal(corr_X[0, 0], 1, decimal=1) - assert_almost_equal(corr_X[0, 1], 1, decimal=1) - assert_almost_equal(corr_X[0, 2], 1, decimal=1) - assert_almost_equal(corr_X[1, 0], 1, decimal=1) - assert_almost_equal(corr_X[1, 1], 1, decimal=1) - assert_almost_equal(corr_X[1, 2], 1, decimal=1) - assert_almost_equal(corr_X[2, 0], 1, decimal=1) - assert_almost_equal(corr_X[2, 1], 1, decimal=1) - assert_almost_equal(corr_X[2, 2], 1, decimal=1) - assert_almost_equal(corr_X[3, 3], 1, decimal=1) - assert_almost_equal(corr_X[3, 4], 1, decimal=1) - assert_almost_equal(corr_X[3, 5], 1, decimal=1) - assert_almost_equal(corr_X[4, 3], 1, decimal=1) - assert_almost_equal(corr_X[4, 4], 1, decimal=1) - assert_almost_equal(corr_X[4, 5], 1, decimal=1) - assert_almost_equal(corr_X[5, 3], 1, decimal=1) - assert_almost_equal(corr_X[5, 4], 1, decimal=1) - assert_almost_equal(corr_X[5, 5], 1, decimal=1) + from itertools import product + groups = [[0, 1, 2], [3, 4, 5]] + for g in groups: + for i, j in product(g, g): + assert_almost_equal(corr_X[i, j], 1, decimal=1) def test_multivariate_1D_simulation_exception(): From 9c69a8fffec31cb201265c740ecdafd3fb90e8c7 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 7 Jan 2025 10:20:55 +0100 Subject: [PATCH 18/26] Change pvalue to evalue --- src/hidimstat/utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/hidimstat/utils.py b/src/hidimstat/utils.py index 0c1abc44..daf4e276 100644 --- a/src/hidimstat/utils.py +++ b/src/hidimstat/utils.py @@ -84,17 +84,17 @@ def _bhq_threshold(pvals, fdr=0.1): return -1.0 -def _ebh_threshold(pvals, fdr=0.1): +def _ebh_threshold(evals, fdr=0.1): """e-BH procedure for FDR control (see Wang and Ramdas 2021)""" - n_features = len(pvals) - pvals_sorted = -np.sort(-pvals) # sort in descending order + n_features = len(evals) + evals_sorted = -np.sort(-evals) # sort in descending order selected_index = 2 * n_features for i in range(n_features - 1, -1, -1): - if pvals_sorted[i] >= n_features / (fdr * (i + 1)): + if evals_sorted[i] >= n_features / (fdr * (i + 1)): selected_index = i break if selected_index <= n_features: - return pvals_sorted[selected_index] + return evals_sorted[selected_index] else: return np.inf From 12edb2342c42c97ec10691a94cc43a4978a176cb Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 7 Jan 2025 15:15:55 +0100 Subject: [PATCH 19/26] Fix bugs and typo --- test/test_scenario.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_scenario.py b/test/test_scenario.py index 08be5e06..11a35c21 100644 --- a/test/test_scenario.py +++ b/test/test_scenario.py @@ -63,6 +63,7 @@ def test_multivariate_1D_simulation(): ) corr_X = np.corrcoef(X) from itertools import product + groups = [[0, 1, 2], [3, 4, 5]] for g in groups: for i, j in product(g, g): @@ -80,7 +81,7 @@ def test_multivariate_1D_simulation_exception(): with pytest.raises( ValueError, - match="The number of samples is too small compate to the number " + match="The number of samples is too small compared to the number " "of group and their size to gerate the data.", ): multivariate_1D_simulation(n_samples=10, n_groups=2, group_size=6) From e06122f2cce53c5074bd4f3a5a9d4e3f5af7e2d1 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: 
Fri, 17 Jan 2025 10:34:10 +0100 Subject: [PATCH 20/26] Apply suggestions from code review change position of import Co-authored-by: Joseph Paillard --- test/test_scenario.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_scenario.py b/test/test_scenario.py index 11a35c21..12b3533d 100644 --- a/test/test_scenario.py +++ b/test/test_scenario.py @@ -1,7 +1,7 @@ """ Test the scenario module """ - +from itertools import product import numpy as np import pytest from numpy.testing import assert_almost_equal, assert_equal @@ -62,7 +62,6 @@ def test_multivariate_1D_simulation(): n_samples=9, n_groups=2, group_size=3 ) corr_X = np.corrcoef(X) - from itertools import product groups = [[0, 1, 2], [3, 4, 5]] for g in groups: From cee3f9afbcf4c348037511aec2cec8b78abcad97 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 17 Jan 2025 10:37:41 +0100 Subject: [PATCH 21/26] Update test_scenario.py Formatting the file --- test/test_scenario.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_scenario.py b/test/test_scenario.py index 12b3533d..28d27618 100644 --- a/test/test_scenario.py +++ b/test/test_scenario.py @@ -1,6 +1,7 @@ """ Test the scenario module """ + from itertools import product import numpy as np import pytest From 0937e9cb719614519320fc7abb5bee00ffacde4c Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 17 Jan 2025 18:49:48 +0100 Subject: [PATCH 22/26] Reverse the option group --- src/hidimstat/scenario.py | 31 +++------------------- test/test_clustered_inference.py | 45 ++++++++++++++++++-------------- test/test_scenario.py | 28 -------------------- 3 files changed, 29 insertions(+), 75 deletions(-) diff --git a/src/hidimstat/scenario.py b/src/hidimstat/scenario.py index c07dfdbf..1bc18be0 100644 --- a/src/hidimstat/scenario.py +++ b/src/hidimstat/scenario.py @@ -16,8 +16,6 @@ def multivariate_1D_simulation( rho=0.0, shuffle=True, seed=0, - n_groups=0, - group_size=0, ): """Generate 1D data with Toeplitz design matrix @@ -44,17 +42,10 @@ def multivariate_1D_simulation( seed : int Seed used for generating design matrix and noise. - n_groups : int - Number of groups. - - group_size : int - Size of each group. - Returns ------- X : ndarray, shape (n_samples, n_features) Design matrix. - if there are some groups, the first rows contains the group index y : ndarray, shape (n_samples,) Target. @@ -68,33 +59,17 @@ def multivariate_1D_simulation( rng = np.random.default_rng(seed) - if n_groups < 0 or group_size < 0: - raise ValueError("The number of groups and their size must be positive.") - - n_individual_samples = n_samples - n_groups * group_size - if n_individual_samples <= 0: - raise ValueError( - "The number of samples is too small compared to the number " - "of group and their size to gerate the data." 
- ) - - n_generate_samples = n_groups + n_individual_samples # generate random data for each samples - X = np.zeros((n_generate_samples, n_features)) - X[:, 0] = rng.standard_normal(n_generate_samples) + X = np.zeros((n_samples, n_features)) + X[:, 0] = rng.standard_normal(n_samples) for i in np.arange(1, n_features): - rand_vector = ((1 - rho**2) ** 0.5) * rng.standard_normal(n_generate_samples) + rand_vector = ((1 - rho**2) ** 0.5) * rng.standard_normal(n_samples) X[:, i] = rho * X[:, i - 1] + rand_vector if shuffle: rng.shuffle(X.T) - # generate data for the groups based on one sample - if n_groups > 0: - groups = np.repeat(X[:n_groups], group_size, axis=0) - X = np.vstack((groups, X[n_groups:])) - # generate the vector of variable of importances beta = np.zeros(n_features) beta[0:support_size] = 1.0 diff --git a/test/test_clustered_inference.py b/test/test_clustered_inference.py index 12149597..221647e2 100644 --- a/test/test_clustered_inference.py +++ b/test/test_clustered_inference.py @@ -14,6 +14,8 @@ multivariate_temporal_simulation, ) +from sklearn.datasets import make_multilabel_classification + # Scenario 1: data with no temporal dimension def test_clustered_inference_no_temporal(): @@ -129,10 +131,9 @@ def test_clustered_inference_no_temporal_groups(): the support and p-values close to 0.5 for the others. """ - n_samples, n_features = 100, 2000 + n_samples, n_features = 20, 2000 support_size = 10 - n_groups = 9 - size = 10 + n_groups = 10 sigma = 5.0 rho = 0.95 n_clusters = 200 @@ -140,21 +141,27 @@ def test_clustered_inference_no_temporal_groups(): interior_support = support_size - margin_size extended_support = support_size + margin_size - X_init, y, beta, epsilon = multivariate_1D_simulation( - n_samples=n_samples, - n_features=n_features, - support_size=support_size, - sigma=sigma, - rho=rho, - shuffle=False, - seed=2, - n_groups=n_groups, - group_size=size, - ) - groups = np.concatenate([[i] * size for i in range(n_groups + 1)]) - - y = y - np.mean(y) - X_init = X_init - np.mean(X_init, axis=0) + # create n_group of samples + X_ = [] + y_ = [] + for i in range(n_groups): + X_init, y, beta, epsilon = multivariate_1D_simulation( + n_samples=n_samples, + n_features=n_features, + support_size=support_size, + sigma=sigma, + rho=rho, + shuffle=False, + seed=2 + i, + ) + X_.append(X_init) + y_.append(y) + + y_ = np.concatenate(y_) + y_ = y_ - np.mean(y_) + X_ = np.concatenate(X_) + X_ = X_ - np.mean(X_, axis=0) + groups = np.repeat(np.arange(0, n_groups), n_samples) connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1) ward = FeatureAgglomeration( @@ -162,7 +169,7 @@ def test_clustered_inference_no_temporal_groups(): ) beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( - clustered_inference(X_init, y, ward, n_clusters, groups=groups) + clustered_inference(X_, y_, ward, n_clusters, groups=groups) ) expected = 0.5 * np.ones(n_features) diff --git a/test/test_scenario.py b/test/test_scenario.py index 28d27618..904dda48 100644 --- a/test/test_scenario.py +++ b/test/test_scenario.py @@ -58,34 +58,6 @@ def test_multivariate_1D_simulation(): rho_hat = np.corrcoef(X[:, 19], X[:, 20])[0, 1] assert_almost_equal(rho_hat, 0, decimal=1) - # Test 3 - X, y, beta, noise = multivariate_1D_simulation( - n_samples=9, n_groups=2, group_size=3 - ) - corr_X = np.corrcoef(X) - - groups = [[0, 1, 2], [3, 4, 5]] - for g in groups: - for i, j in product(g, g): - assert_almost_equal(corr_X[i, j], 1, decimal=1) - - -def test_multivariate_1D_simulation_exception(): - """ - Test 
when the input paramters is not correct. - """ - with pytest.raises( - ValueError, match="The number of groups and their size must be positive." - ): - multivariate_1D_simulation(n_groups=-1) - - with pytest.raises( - ValueError, - match="The number of samples is too small compared to the number " - "of group and their size to gerate the data.", - ): - multivariate_1D_simulation(n_samples=10, n_groups=2, group_size=6) - def test_multivariate_simulation(): """Test if the data has expected shape, if the input parameters From 852106f9c329bbe0d0d42ebdc4c5421119093795 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 17 Jan 2025 18:53:47 +0100 Subject: [PATCH 23/26] Remove unecessesary import --- test/test_clustered_inference.py | 2 -- test/test_scenario.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/test/test_clustered_inference.py b/test/test_clustered_inference.py index 221647e2..8b4d501a 100644 --- a/test/test_clustered_inference.py +++ b/test/test_clustered_inference.py @@ -14,8 +14,6 @@ multivariate_temporal_simulation, ) -from sklearn.datasets import make_multilabel_classification - # Scenario 1: data with no temporal dimension def test_clustered_inference_no_temporal(): diff --git a/test/test_scenario.py b/test/test_scenario.py index 904dda48..503f6848 100644 --- a/test/test_scenario.py +++ b/test/test_scenario.py @@ -2,9 +2,7 @@ Test the scenario module """ -from itertools import product import numpy as np -import pytest from numpy.testing import assert_almost_equal, assert_equal from hidimstat.scenario import ( From 943fe11ea0046a1297a2ceb0420944ea178b9219 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 17 Jan 2025 18:55:38 +0100 Subject: [PATCH 24/26] Format file --- src/hidimstat/scenario.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/hidimstat/scenario.py b/src/hidimstat/scenario.py index 1bc18be0..e9bde2dc 100644 --- a/src/hidimstat/scenario.py +++ b/src/hidimstat/scenario.py @@ -59,7 +59,6 @@ def multivariate_1D_simulation( rng = np.random.default_rng(seed) - # generate random data for each samples X = np.zeros((n_samples, n_features)) X[:, 0] = rng.standard_normal(n_samples) From 85b44a357e6a546d50fe1524d00ea7f1686300e1 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 20 Jan 2025 13:27:02 +0100 Subject: [PATCH 25/26] small difference --- src/hidimstat/utils.py | 2 +- test/test_generate_knockoff.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hidimstat/utils.py b/src/hidimstat/utils.py index daf4e276..1c67e1b2 100644 --- a/src/hidimstat/utils.py +++ b/src/hidimstat/utils.py @@ -96,7 +96,7 @@ def _ebh_threshold(evals, fdr=0.1): if selected_index <= n_features: return evals_sorted[selected_index] else: - return np.inf + return np.infty def _bhy_threshold(pvals, reshaping_function=None, fdr=0.1): diff --git a/test/test_generate_knockoff.py b/test/test_generate_knockoff.py index 231e8486..570e6a99 100644 --- a/test/test_generate_knockoff.py +++ b/test/test_generate_knockoff.py @@ -9,7 +9,7 @@ import pytest -def test_estimate_distribution_wolf(): +def test_estimate_distribution_ledoit_wolf(): seed = 42 n = 100 p = 50 From 85a6026f1c6d1e2a7d03a2388bcf39530902a189 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 21 Jan 2025 08:58:41 +0100 Subject: [PATCH 26/26] remove file setup --- src/hidimstat/setup.py | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 src/hidimstat/setup.py diff --git a/src/hidimstat/setup.py b/src/hidimstat/setup.py deleted file mode 100644 index 15c6389d..00000000 --- 
a/src/hidimstat/setup.py +++ /dev/null @@ -1,14 +0,0 @@ -def configuration(parent_package="", top_path=None): - from numpy.distutils.misc_util import Configuration - - config = Configuration("plotting", parent_package, top_path) - - config.add_subpackage("tests") - - return config - - -if __name__ == "__main__": - from numpy.distutils.core import setup - - setup(**configuration(top_path="").todict())
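
For readers following patch 18 (and the np.infty tweak in patch 25), the e-BH rule implemented by _ebh_threshold in src/hidimstat/utils.py can be summarised as: sort the e-values in descending order and keep the largest k whose k-th e-value is at least n_features / (fdr * k). The sketch below is a minimal re-statement of that rule, not the library code; the evals array and the fdr level are made-up illustrative values.

import numpy as np

def ebh_threshold_sketch(evals, fdr=0.1):
    # e-BH (Wang and Ramdas, 2021): with e-values sorted in descending
    # order, find the largest k such that the k-th largest e-value is at
    # least n_features / (fdr * k); every e-value reaching that threshold
    # is selected.
    n_features = len(evals)
    evals_sorted = -np.sort(-evals)  # descending order
    for k in range(n_features, 0, -1):
        if evals_sorted[k - 1] >= n_features / (fdr * k):
            return evals_sorted[k - 1]
    return np.inf  # nothing can be selected at this FDR level

evals = np.array([120.0, 45.0, 30.0, 2.0, 0.5])  # illustrative e-values
threshold = ebh_threshold_sketch(evals, fdr=0.2)
selected = np.where(evals >= threshold)[0]  # -> features 0, 1 and 2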
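
The changes to multivariate_1D_simulation in this series (patches 14, 15, 22 and 24) all revolve around the same construction: the design matrix is built column by column as an AR(1) process, each feature being rho times the previous one plus rescaled fresh noise, so adjacent features are correlated at level rho while every column keeps unit variance. A minimal sketch of that construction, with illustrative parameter values that differ from the ones used in the tests:

import numpy as np

rng = np.random.default_rng(0)
n_samples, n_features, rho = 5000, 20, 0.7  # illustrative sizes

X = np.zeros((n_samples, n_features))
X[:, 0] = rng.standard_normal(n_samples)
for i in range(1, n_features):
    # mix the previous column with fresh noise, scaled so that the new
    # column keeps variance 1 and correlation rho with its neighbour
    noise = np.sqrt(1 - rho**2) * rng.standard_normal(n_samples)
    X[:, i] = rho * X[:, i - 1] + noise

print(np.corrcoef(X[:, 0], X[:, 1])[0, 1])  # close to 0.7

With rho close to 1, neighbouring features carry almost the same signal, which is why the tests only check support recovery up to a spatial tolerance (margin_size).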
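
The clustered-inference tests reworked in patch 22 compress the 2000 features into 200 spatially contiguous clusters with Ward feature agglomeration before running inference. A minimal sketch of that dimension-reduction step on its own, using the same scikit-learn calls as the tests but smaller, made-up sizes:

import numpy as np
from sklearn.cluster import FeatureAgglomeration
from sklearn.feature_extraction import image

rng = np.random.default_rng(0)
n_samples, n_features, n_clusters = 50, 400, 40  # illustrative sizes
X = rng.standard_normal((n_samples, n_features))

# 1D connectivity so that only neighbouring features can be merged
connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1)
ward = FeatureAgglomeration(
    n_clusters=n_clusters, connectivity=connectivity, linkage="ward"
)
X_reduced = ward.fit_transform(X)  # shape (n_samples, n_clusters)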