diff --git a/doc_conf/api.rst b/doc_conf/api.rst
index 885873f3..13937444 100644
--- a/doc_conf/api.rst
+++ b/doc_conf/api.rst
@@ -26,7 +26,8 @@ Functions
    knockoff_aggregation
    model_x_knockoff
    multivariate_1D_simulation
-   permutation_test_cv
+   permutation_test
+   permutation_test_pval
    reid
    standardized_svr
    zscore_from_pval
diff --git a/doc_conf/references.bib b/doc_conf/references.bib
index 4b8a42a4..a7182329 100644
--- a/doc_conf/references.bib
+++ b/doc_conf/references.bib
@@ -179,6 +179,24 @@ @article{liuFastPowerfulConditional2021
   file = {/home/ahmad/Zotero/storage/8HRQZX3H/Liu et al. - 2021 - Fast and Powerful Conditional Randomization Testin.pdf;/home/ahmad/Zotero/storage/YFNDKN2B/2006.html}
 }
 
+@book{westfall1993resampling,
+  title={Resampling-based multiple testing: Examples and methods for p-value adjustment},
+  author={Westfall, Peter H and Young, S Stanley},
+  volume={279},
+  year={1993},
+  publisher={John Wiley \& Sons}
+}
+
+@article{hirschhorn2005genome,
+  title={Genome-wide association studies for common diseases and complex traits},
+  author={Hirschhorn, Joel N and Daly, Mark J},
+  journal={Nature Reviews Genetics},
+  volume={6},
+  number={2},
+  pages={95--108},
+  year={2005},
+  publisher={Nature Publishing Group UK London}
+}
 @article{gaonkar_deriving_2012,
   title = {Deriving statistical significance maps for {SVM} based image classification and group comparisons},
   volume = {15},
@@ -192,4 +210,4 @@ @article{gaonkar_deriving_2012
   year = {2012},
   pmid = {23285616},
   pmcid = {PMC3703958},
-}
\ No newline at end of file
+}
diff --git a/examples/plot_fmri_data_example.py b/examples/plot_fmri_data_example.py
index dbe14c07..92c6936c 100644
--- a/examples/plot_fmri_data_example.py
+++ b/examples/plot_fmri_data_example.py
@@ -52,15 +52,18 @@
 from sklearn.cluster import FeatureAgglomeration
 from sklearn.feature_extraction import image
 from sklearn.linear_model import Ridge
+from sklearn.svm import LinearSVR
 from sklearn.utils import Bunch
 
 from hidimstat.ada_svr import ada_svr
 from hidimstat.clustered_inference import clustered_inference
 from hidimstat.ensemble_clustered_inference import ensemble_clustered_inference
-from hidimstat.permutation_test import permutation_test, permutation_test_cv
+from hidimstat.permutation_test import permutation_test, permutation_test_pval
 from hidimstat.standardized_svr import standardized_svr
 from hidimstat.stat_tools import pval_from_scale, zscore_from_pval
 
+n_job = None
+
 
 #############################################################################
 # Function to fetch and preprocess Haxby dataset
@@ -151,19 +154,28 @@ def preprocess_haxby(subject=2, memory=None):
 SVR_permutation_test_inference = False
 if SVR_permutation_test_inference:
-    # We computed the regularization parameter by CV (C = 0.1)
-    pval_corr_svr_perm_test, one_minus_pval_corr_svr_perm_test = permutation_test_cv(
-        X, y, n_permutations=50, C=0.1
+    # It would be better to combine cross-validation with the estimator,
+    # but for the sake of time, this is not done.
+    estimator = LinearSVR()
+    weight_svr, weight_svr_distribution = permutation_test(
+        X, y, estimator, n_permutations=50
+    )
+    pval_corr_svr_perm_test, one_minus_pval_corr_svr_perm_test = permutation_test_pval(
+        weight_svr, weight_svr_distribution
     )
 
 # Another method is to compute the p-values by permutation test from the
 # Ridge decoder. The solution provided by this method should be very close to
 # the previous one and the computation time is much shorter: around 20 seconds.
-
+# We computed the parameter from a cross-validation (alpha = 0.0215)
+# It would be better to use RidgeCV, but for the sake of time, this is not done.
 estimator = Ridge()
-pval_corr_ridge_perm_test, one_minus_pval_corr_ridge_perm_test = permutation_test(
+weight_ridge, weight_ridge_distribution = permutation_test(
     X, y, estimator=estimator, n_permutations=200
 )
+pval_corr_ridge_perm_test, one_minus_pval_corr_ridge_perm_test = permutation_test_pval(
+    weight_ridge, weight_ridge_distribution
+)
 
 
 #############################################################################
 # Now, let us run the algorithm introduced by Gaonkar et al. (c.f. References).
@@ -305,4 +317,4 @@ def plot_map(
 # (EnCluDL) seems realistic as we recover the visual cortex and do not make
 # spurious discoveries.
 
-show()
+# show()
diff --git a/pyproject.toml b/pyproject.toml
index 1dde446f..ed7b0765 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -80,4 +80,8 @@ where = ["src"]
 
 [tool.hatch.version]
-source = "vcs"
\ No newline at end of file
+source = "vcs"
+
+#pyproject.toml
+[tool.pytest.ini_options]
+addopts = "--ignore=src" # ignore src directory
\ No newline at end of file
diff --git a/src/hidimstat/__init__.py b/src/hidimstat/__init__.py
index 24e0321b..a05592cf 100644
--- a/src/hidimstat/__init__.py
+++ b/src/hidimstat/__init__.py
@@ -7,7 +7,7 @@
 from .knockoffs import model_x_knockoff
 from .multi_sample_split import aggregate_quantiles
 from .noise_std import group_reid, reid
-from .permutation_test import permutation_test_cv
+from .permutation_test import permutation_test, permutation_test_pval
 from .scenario import multivariate_1D_simulation
 from .standardized_svr import standardized_svr
 from .stat_tools import zscore_from_pval
@@ -34,7 +34,8 @@
     "knockoff_aggregation",
     "model_x_knockoff",
     "multivariate_1D_simulation",
-    "permutation_test_cv",
+    "permutation_test",
+    "permutation_test_pval",
     "reid",
     "standardized_svr",
     "zscore_from_pval",
diff --git a/src/hidimstat/permutation_test.py b/src/hidimstat/permutation_test.py
index ed38a9c3..7f218488 100644
--- a/src/hidimstat/permutation_test.py
+++ b/src/hidimstat/permutation_test.py
@@ -1,25 +1,20 @@
 import numpy as np
 from joblib import Parallel, delayed
 from sklearn.base import clone
-from sklearn.model_selection import GridSearchCV
-from sklearn.pipeline import Pipeline
-from sklearn.svm import LinearSVR
-from sklearn.utils import _safe_indexing
-
-from hidimstat.stat_tools import pval_from_two_sided_pval_and_sign
-
-
-def permutation_test_cv(
-    X,
-    y,
-    n_permutations=1000,
-    C=None,
-    Cs=np.logspace(-7, 1, 9),
-    seed=0,
-    n_jobs=1,
-    verbose=1,
+
+from hidimstat.stat_tools import pval_from_two_sided_pval_and_sign, step_down_max_t
+
+__all__ = ["permutation_test", "permutation_test_pval"]
+
+
+def permutation_test(
+    X, y, estimator, n_permutations=1000, seed=0, n_jobs=None, verbose=0
 ):
-    """Cross-validated permutation test shuffling the target
+    """
+    Permutation test
+
+    This function computes the distribution of the weights of a linear model
+    by shuffling the target :footcite:t:`hirschhorn2005genome`.
 
     Parameters
     ----------
@@ -29,16 +24,8 @@ def permutation_test_cv(
     y : ndarray, shape (n_samples,)
         Target.
 
-    C : float or None, optional (default=None)
-        If None, the linear SVR regularization parameter is set by cross-val
-        running a grid search on the list of hyper-parameters contained in Cs.
-        Otherwise, the regularization parameter is equal to C.
-        The strength of the regularization is inversely proportional to C.
-
-    Cs : ndarray, optional (default=np.logspace(-7, 1, 9))
-        If C is None, the linear SVR regularization parameter is set by
-        cross-val running a grid search on the list of hyper-parameters
-        contained in Cs.
+    estimator : sklearn linear estimator
+        The linear model used to fit the data.
 
     n_permutations : int, optional (default=1000)
         Number of permutations used to compute the survival function
@@ -47,77 +34,58 @@
     seed : int, optional (default=0)
         Determines the permutations used for shuffling the target
 
-    n_jobs : int or None, optional (default=1)
+    n_jobs : int or None, optional (default=None)
         Number of CPUs to use during the cross validation.
 
-    verbose: int, optional (default=1)
-        The verbosity level: if non zero, progress messages are printed
-        when computing the permutation stats in parralel.
-        The frequency of the messages increases with the verbosity level.
+    verbose : int, optional (default=0)
+        The verbosity level passed to joblib.Parallel.
 
     Returns
    -------
-    pval_corr : ndarray, shape (n_features,)
-        p-value corrected for multiple testing, with numerically accurate
-        values for positive effects (ie., for p-value close to zero).
+    weights : ndarray, shape (n_features,)
+        The weights of the original model.
 
-    one_minus_pval_corr : ndarray, shape (n_features,)
-        One minus the corrected p-value, with numerically accurate
-        values for negative effects (ie., for p-value close to one).
-    """
+    weights_distribution : ndarray, shape (n_permutations, n_features)
+        The distribution of the weights of the model obtained by shuffling
+        the target n_permutations times.
 
-    if C is None:
+    References
+    ----------
+    .. footbibliography::
 
-        steps = [("SVR", LinearSVR())]
-        pipeline = Pipeline(steps)
-        parameters = {"SVR__C": Cs}
-        grid = GridSearchCV(pipeline, param_grid=parameters, n_jobs=n_jobs)
-        grid.fit(X, y)
-        C = grid.best_params_["SVR__C"]
-        estimator = LinearSVR(C=C)
+    """
 
-    else:
+    rng = np.random.default_rng(seed)
 
-        estimator = LinearSVR(C=C)
+    # Get the weights of the original model
+    if not hasattr(estimator, "coef_"):
+        weights = _fit_and_weights(estimator, X, y)
+    else:
+        weights = estimator.coef_
 
-    pval_corr, one_minus_pval_corr = permutation_test(
-        X,
-        y,
-        estimator,
-        n_permutations=n_permutations,
-        seed=seed,
-        n_jobs=n_jobs,
-        verbose=verbose,
+    # Get the distribution of the weights by shuffling the target
+    weights_distribution = Parallel(n_jobs=n_jobs, verbose=verbose)(
+        delayed(_fit_and_weights)(clone(estimator), X, _shuffle_vector(y, rng))
+        for _ in range(n_permutations)
     )
 
-    return pval_corr, one_minus_pval_corr
+    # Convert the list of weights into an array
+    weights_distribution = np.array(weights_distribution)
 
+    return weights, weights_distribution
 
 
-def permutation_test(X, y, estimator, n_permutations=1000, seed=0, n_jobs=1, verbose=1):
-    """Permutation test shuffling the target
+
+def permutation_test_pval(weights, weights_distribution):
+    """
+    Compute p-values from a permutation test
 
     Parameters
     ----------
-    X : ndarray, shape (n_samples, n_features)
-        Data.
+    weights : ndarray, shape (n_features,)
+        The weights of the original model.
 
-    y : ndarray, shape (n_samples,)
-        Target.
-
-    n_permutations : int, optional (default=1000)
-        Number of permutations used to compute the survival function
-        and cumulative distribution function scores.
-
-    seed : int, optional (default=0)
-        Determines the permutations used for shuffling the target
-
-    n_jobs : int or None, optional (default=1)
-        Number of CPUs to use during the cross validation.
-
-    verbose: int, optional (default=1)
-        The verbosity level: if non zero, progress messages are printed
-        when computing the permutation stats in parralel.
-        The frequency of the messages increases with the verbosity level.
+    weights_distribution : ndarray, shape (n_permutations, n_features)
+        The weight distribution obtained by shuffling the target n_permutations times.
 
     Returns
     -------
@@ -129,20 +97,9 @@ def permutation_test(X, y, estimator, n_permutations=1000, seed=0, n_jobs=1, ver
         One minus the corrected p-value, with numerically accurate
         values for negative effects (ie., for p-value close to one).
     """
+    two_sided_pval_corr = step_down_max_t(weights, weights_distribution)
 
-    rng = np.random.default_rng(seed)
-
-    stat = _permutation_test_stat(clone(estimator), X, y)
-
-    permutation_stats = Parallel(n_jobs=n_jobs, verbose=verbose)(
-        delayed(_permutation_test_stat)(clone(estimator), X, _shuffle(y, rng))
-        for _ in range(n_permutations)
-    )
-
-    permutation_stats = np.array(permutation_stats)
-    two_sided_pval_corr = step_down_max_T(stat, permutation_stats)
-
-    stat_sign = np.sign(stat)
+    stat_sign = np.sign(weights)
 
     pval_corr, _, one_minus_pval_corr, _ = pval_from_two_sided_pval_and_sign(
         two_sided_pval_corr, stat_sign
@@ -151,65 +108,47 @@
     return pval_corr, one_minus_pval_corr
 
 
-def _permutation_test_stat(estimator, X, y):
-    """Fit estimator and get coef"""
-    stat = estimator.fit(X, y).coef_
-    return stat
-
-
-def _shuffle(y, rng):
-    """Shuffle vector"""
-    indices = rng.permutation(len(y))
-    return _safe_indexing(y, indices)
-
-
-def step_down_max_T(stat, permutation_stats):
-    """Step-down maxT algorithm for computing adjusted p-values
+def _fit_and_weights(estimator, X, y):
+    """
+    Fit the estimator and return the weights
 
     Parameters
     ----------
-    stat : ndarray, shape (n_features,)
-        Statistic computed on the original (unpermutted) problem.
+    estimator : object
+        The estimator to fit.
+
+    X : ndarray, shape (n_samples, n_features)
+        Data.
 
-    permutation_stats : ndarray, shape (n_permutations, n_features)
-        Statistics computed on permutted problems.
+    y : ndarray, shape (n_samples,)
+        Target.
 
     Returns
     -------
-    two_sided_pval_corr : ndarray, shape (n_features,)
-        Two-sided p-values corrected for multiple testing.
-
-    References
-    ----------
-    .. [1] Westfall, P. H., & Young, S. S. (1993). Resampling-based multiple
-       testing: Examples and methods for p-value adjustment (Vol. 279).
-       John Wiley & Sons.
+    weights : ndarray, shape (n_features,)
+        The weights of the estimator.
     """
+    weights = estimator.fit(X, y).coef_
+    return weights
 
-    n_permutations, n_features = np.shape(permutation_stats)
-
-    index_ordered = np.argsort(np.abs(stat))
-    stat_ranked = np.empty(n_features)
-    stat_ranked[index_ordered] = np.arange(n_features)
-    stat_ranked = stat_ranked.astype(int)
-    stat_sorted = np.copy(np.abs(stat)[index_ordered])
-    permutation_stats_ordered = np.copy(np.abs(permutation_stats)[:, index_ordered])
-    for i in range(1, n_features):
-        permutation_stats_ordered[:, i] = np.maximum(
-            permutation_stats_ordered[:, i - 1], permutation_stats_ordered[:, i]
-        )
-
-    two_sided_pval_corr = (
-        np.sum(np.less_equal(stat_sorted, permutation_stats_ordered), axis=0)
-        / n_permutations
-    )
+def _shuffle_vector(y, rng):
+    """
+    Shuffle the target
 
-    for i in range(n_features - 1)[::-1]:
-        two_sided_pval_corr[i] = np.maximum(
-            two_sided_pval_corr[i], two_sided_pval_corr[i + 1]
-        )
+    Parameters
+    ----------
+    y : ndarray, shape (n_samples,)
+        Target.
 
-    two_sided_pval_corr = np.copy(two_sided_pval_corr[stat_ranked])
+    rng : numpy.random.Generator
+        Random number generator.
 
-    return two_sided_pval_corr
+    Returns
+    -------
+    y_shuffled : ndarray, shape (n_samples,)
+        Shuffled target.
+    """
+    y_copy = np.copy(y)
+    rng.shuffle(y_copy)
+    return y_copy
diff --git a/src/hidimstat/stat_tools.py b/src/hidimstat/stat_tools.py
index 0ced13e9..2603c65c 100644
--- a/src/hidimstat/stat_tools.py
+++ b/src/hidimstat/stat_tools.py
@@ -390,3 +390,73 @@ def two_sided_pval_from_pval(pval, one_minus_pval=None, distrib="norm"):
     )
 
     return two_sided_pval, two_sided_pval_corr
+
+
+def step_down_max_t(stat, permutation_stats):
+    """
+    Step-down maxT multiple testing procedure
+
+    | This algorithm computes adjusted p-values :footcite:t:`westfall1993resampling`.
+    | It assumes that the test statistics are centered around zero.
+    | This algorithm controls the family-wise error rate (FWER) by:
+    | 1. Ordering test statistics by absolute value
+    | 2. Computing successively larger null distributions
+    | 3. Adjusting p-values using a step-down procedure
+
+    Parameters
+    ----------
+    stat : ndarray, shape (n_features,)
+        Statistic computed on the original (unpermuted) problem.
+
+    permutation_stats : ndarray, shape (n_permutations, n_features)
+        Statistics computed on permuted problems.
+
+    Returns
+    -------
+    two_sided_pval_corr : ndarray, shape (n_features,)
+        Two-sided p-values corrected for multiple testing.
+
+    References
+    ----------
+    .. footbibliography::
+    """
+
+    n_permutations, n_features = np.shape(permutation_stats)
+
+    # Step 1: Order features by absolute value of test statistics
+    # Keep track of original positions
+    index_ordered = np.argsort(np.abs(stat))
+    stat_ranked = np.empty(n_features, dtype=int)
+    stat_ranked[index_ordered] = np.arange(n_features)
+    # Sorted absolute statistics
+    stat_sorted = np.copy(np.abs(stat)[index_ordered])
+    # Order permutation stats similarly
+    permutation_stats_ordered = np.copy(np.abs(permutation_stats)[:, index_ordered])
+
+    # Step 2: Update the null distribution
+    # For each column i, take the maximum of the current and previous column
+    # This creates successively larger null distributions
+    for i in range(1, n_features):
+        permutation_stats_ordered[:, i] = np.maximum(
+            permutation_stats_ordered[:, i - 1], permutation_stats_ordered[:, i]
+        )
+
+    # Step 3: Compute raw adjusted p-values
+    # Count how many permutation statistics are >= the observed statistics
+    two_sided_pval_corr = (
+        np.sum(np.less_equal(stat_sorted, permutation_stats_ordered), axis=0)
+        / n_permutations
+    )
+
+    # Step 4: Enforce monotonicity
+    # Ensure that p-values are monotonically non-increasing
+    # by taking the maximum of the current and next p-value
+    for i in range(n_features - 1)[::-1]:
+        two_sided_pval_corr[i] = np.maximum(
+            two_sided_pval_corr[i], two_sided_pval_corr[i + 1]
+        )
+
+    # Step 5: Rearrange p-values back to original feature order
+    two_sided_pval_corr = np.copy(two_sided_pval_corr[stat_ranked])
+
+    return two_sided_pval_corr
diff --git a/test/test_permutation_test.py b/test/test_permutation_test.py
index dc94fecf..b8c9c41d 100644
--- a/test/test_permutation_test.py
+++ b/test/test_permutation_test.py
@@ -4,70 +4,151 @@
 import numpy as np
 from numpy.testing import assert_almost_equal
+from sklearn.svm import LinearSVR
+from sklearn.base import clone
 
-from hidimstat.permutation_test import permutation_test_cv
+from hidimstat.permutation_test import permutation_test, permutation_test_pval
 from hidimstat.scenario import multivariate_1D_simulation
 
 
 def test_permutation_test():
-    """Testing the procedure on a simulation with no structure and a support
+    """
+    Testing the procedure on a simulation with no structure and a support
     of size 1. Computing one-sided p-values, we want a low p-value
-    for the first feature and p-values close to 0.5 for the others."""
+    for the first feature and p-values close to 0.5 for the others.
+    """
 
+    # parameters for generating the data
     n_samples, n_features = 20, 50
+    n_permutations = 100
     support_size = 1
-    sigma = 0.1
-    rho = 0.0
 
     X_init, y, beta, noise = multivariate_1D_simulation(
         n_samples=n_samples,
         n_features=n_features,
         support_size=support_size,
-        sigma=sigma,
-        rho=rho,
+        sigma=0.1,
+        rho=0.0,
         shuffle=False,
         seed=3,
     )
 
-    y = y - np.mean(y)
-    X_init = X_init - np.mean(X_init, axis=0)
+    # create a linear SVR estimator
+    estimator = LinearSVR(C=1.0)
 
-    pval_corr, one_minus_pval_corr = permutation_test_cv(X_init, y, n_permutations=100)
+    # run the permutation test
+    weight, weight_distribution = permutation_test(
+        X_init, y, estimator=estimator, n_permutations=n_permutations
+    )
+    # test the shape of the output
+    assert weight.shape == (n_features,)
+    assert weight_distribution.shape == (n_permutations, n_features)
+    # test weights
+    assert not np.all(weight == 0.0)
+    # test the distribution of the weights
+    assert not np.all(weight_distribution == 0.0)
+    assert not np.all(weight_distribution == weight)
+    for i in range(n_permutations - 1):
+        assert not np.all(weight_distribution[i, :] == weight_distribution[i + 1, :])
+
+    # compute the p-values
+    pval_corr, _ = permutation_test_pval(weight, weight_distribution)
 
+    # expected p-values
     expected = 0.5 * np.ones(n_features)
     expected[:support_size] = 0.0
 
+    # test
     assert_almost_equal(pval_corr, expected, decimal=1)
 
 
-def test_permutation_test_with_SVR():
-    """Testing the procedure on a simulation with no structure and a support
-    of size 1. Computing one-sided p-values, we want a low p-value
-    for the first feature and p-values close to 0.5 for the others."""
+def test_permutation_test_with_fitted_estimator():
+    """
+    Testing that the weights of a pre-fitted estimator are the same as the ones
+    obtained from the function _fit_and_weights.
+ """ + # parameters for generating the data n_samples, n_features = 20, 50 + n_permutations = 100 support_size = 1 - sigma = 0.1 - rho = 0.0 + seed_permutation_test = 12 X_init, y, beta, noise = multivariate_1D_simulation( n_samples=n_samples, n_features=n_features, support_size=support_size, - sigma=sigma, - rho=rho, + sigma=0.1, + rho=0.0, shuffle=False, seed=3, ) - y = y - np.mean(y) - X_init = X_init - np.mean(X_init, axis=0) + # create a linear SVR estimator + estimator = LinearSVR(C=1.0, random_state=0) + estimator_fitted = clone(estimator).fit(X_init, y) + + # run the permutation test + weight, weight_distribution = permutation_test( + X_init, + y, + estimator=estimator, + n_permutations=n_permutations, + seed=seed_permutation_test, + ) - pval_corr, one_minus_pval_corr = permutation_test_cv( - X_init, y, n_permutations=100, C=0.1 + # run the permutation test with the fitted estimator + weight_fitted, weight_distribution_fitted = permutation_test( + X_init, + y, + estimator=estimator_fitted, + n_permutations=n_permutations, + seed=seed_permutation_test, ) - expected = 0.5 * np.ones(n_features) - expected[:support_size] = 0.0 + # test the weights + assert np.all(weight == weight_fitted) + # test the distribution of the weights + assert np.all(weight_distribution == weight_distribution_fitted) - assert_almost_equal(pval_corr, expected, decimal=1) + +def test_permutation_test_with_fitted_seed(): + """ + Testing if the weight of a fitted estimator is the same that the one obtained + from the function _fit_and_weights. + """ + + # parameters for generating the data + n_samples, n_features = 20, 50 + n_permutations = 100 + support_size = 1 + + X_init, y, beta, noise = multivariate_1D_simulation( + n_samples=n_samples, + n_features=n_features, + support_size=support_size, + sigma=0.1, + rho=0.0, + shuffle=False, + seed=3, + ) + + # create a linear SVR estimator + estimator = LinearSVR(C=1.0, random_state=0) + estimator_clone = clone(estimator) + + # run the permutation test + weight, weight_distribution = permutation_test( + X_init, y, estimator=estimator, n_permutations=n_permutations, seed=42 + ) + + # run the permutation test with the fitted estimator + weight_seed_diff, weight_distribution_seed_diff = permutation_test( + X_init, y, estimator=estimator_clone, n_permutations=n_permutations, seed=10 + ) + + # test the weights + assert np.all(weight == weight_seed_diff) + # test the distribution of the weights + assert np.all(weight_distribution != weight_distribution_seed_diff)