diff --git a/doc_conf/api.rst b/doc_conf/api.rst
index 84be6a26..8b2b5e39 100644
--- a/doc_conf/api.rst
+++ b/doc_conf/api.rst
@@ -28,7 +28,7 @@ Functions
    permutation_test
    permutation_test_pval
    reid
-   standardized_svr
+   empirical_thresholding
    zscore_from_pval
 
 Classes
diff --git a/doc_conf/references.bib b/doc_conf/references.bib
index 7296f8b9..9de772a0 100644
--- a/doc_conf/references.bib
+++ b/doc_conf/references.bib
@@ -178,6 +178,17 @@
   keywords = {Statistics - Methodology},
 }
 
+@thesis{chevalier_statistical_2020,
+  title = {Statistical control of sparse models in high dimension},
+  url = {https://theses.hal.science/tel-03147200},
+  institution = {Université Paris-Saclay},
+  type = {phdthesis},
+  author = {Chevalier, Jérôme-Alexis},
+  urldate = {2024-10-17},
+  date = {2020-12-11},
+  langid = {english},
+}
+
 @article{benjamini1995controlling,
   title={Controlling the false discovery rate: a practical and powerful approach to multiple testing},
   author={Benjamini, Yoav and Hochberg, Yosef},
diff --git a/examples/plot_fmri_data_example.py b/examples/plot_fmri_data_example.py
index 68c696a1..ef112606 100644
--- a/examples/plot_fmri_data_example.py
+++ b/examples/plot_fmri_data_example.py
@@ -58,8 +58,9 @@
 from hidimstat.adaptative_permutation_threshold_SVR import ada_svr
 from hidimstat.clustered_inference import clustered_inference
 from hidimstat.ensemble_clustered_inference import ensemble_clustered_inference
+from hidimstat.empirical_thresholding import empirical_thresholding
 from hidimstat.permutation_test import permutation_test, permutation_test_pval
-from hidimstat.standardized_svr import standardized_svr
 from hidimstat.stat_tools import pval_from_scale, zscore_from_pval
+from sklearn.svm import LinearSVR
 
 n_job = None
@@ -139,10 +139,11 @@ def preprocess_haxby(subject=2, memory=None):
 # First, we try to recover the discriminative partern by computing
 # p-values from SVR decoder weights and a parametric approximation
 # of the distribution of these weights.
-
-# We precomputed the regularization parameter by CV (C = 0.1) to reduce the
-# computation time of the example.
-beta_hat, scale = standardized_svr(X, y, Cs=[0.1])
+beta_hat, scale = empirical_thresholding(
+    X,
+    y,
+    linear_estimator=LinearSVR(),
+)
 pval_std_svr, _, one_minus_pval_std_svr, _ = pval_from_scale(beta_hat, scale)
 
 #############################################################################
@@ -317,4 +318,4 @@ def plot_map(
 # (EnCluDL) seems realistic as we recover the visual cortex and do not make
 # spurious discoveries.
-# show()
+show()
diff --git a/src/hidimstat/__init__.py b/src/hidimstat/__init__.py
index 333379db..f8114696 100644
--- a/src/hidimstat/__init__.py
+++ b/src/hidimstat/__init__.py
@@ -8,7 +8,7 @@
 from .noise_std import group_reid, reid
 from .permutation_test import permutation_test, permutation_test_pval
 from .scenario import multivariate_1D_simulation
-from .standardized_svr import standardized_svr
+from .empirical_thresholding import empirical_thresholding
 from .stat_tools import zscore_from_pval
 from .cpi import CPI
 from .loco import LOCO
@@ -35,7 +35,7 @@
     "permutation_test",
     "permutation_test_pval",
     "reid",
-    "standardized_svr",
+    "empirical_thresholding",
     "zscore_from_pval",
     "CPI",
     "LOCO",
diff --git a/src/hidimstat/empirical_thresholding.py b/src/hidimstat/empirical_thresholding.py
new file mode 100644
index 00000000..776c55b8
--- /dev/null
+++ b/src/hidimstat/empirical_thresholding.py
@@ -0,0 +1,74 @@
+import numpy as np
+from numpy.linalg import norm
+from sklearn.model_selection import GridSearchCV
+from sklearn.svm import LinearSVR
+
+
+def empirical_thresholding(
+    X,
+    y,
+    linear_estimator=GridSearchCV(
+        LinearSVR(), param_grid={"C": np.logspace(-7, 1, 9)}, n_jobs=None
+    ),
+):
+    """
+    Perform empirical thresholding on the input data and target using a linear
+    estimator.
+
+    This function fits a linear estimator to the input data and target,
+    then derives a common scale from the estimated coefficients so that
+    only the most extreme coefficients stand out after thresholding.
+    For more details, see Section 6.3.2 of :cite:`chevalier_statistical_2020`.
+
+    Parameters
+    ----------
+    X : ndarray, shape (n_samples, n_features)
+        The input data.
+    y : ndarray, shape (n_samples,)
+        The target values.
+    linear_estimator : estimator object, optional (default=GridSearchCV(
+        LinearSVR(), param_grid={"C": np.logspace(-7, 1, 9)}, n_jobs=None))
+        The linear estimator to use for thresholding. It should be a scikit-learn
+        estimator object that implements the `fit` method and has a `coef_`
+        attribute or a `best_estimator_` attribute with a `coef_` attribute
+        (e.g., a `GridSearchCV` object).
+
+    Returns
+    -------
+    beta_hat : ndarray, shape (n_features,)
+        The estimated coefficients of the linear estimator.
+    scale : ndarray, shape (n_features,)
+        The common threshold scale, repeated for each feature.
+
+    Raises
+    ------
+    ValueError
+        If the `linear_estimator` does not have a `coef_` attribute
+        or a `best_estimator_` attribute with a `coef_` attribute.
+
+    Notes
+    -----
+    The scale is the Euclidean norm of the estimated coefficients divided
+    by the square root of the number of features, i.e. their root mean
+    square. Under the assumption that the coefficients follow a normal
+    distribution with mean zero, this estimates their standard deviation.
+    """
+    _, n_features = X.shape
+
+    linear_estimator.fit(X, y)
+
+    if hasattr(linear_estimator, "coef_"):
+        beta_hat = linear_estimator.coef_
+    elif hasattr(linear_estimator, "best_estimator_") and hasattr(
+        linear_estimator.best_estimator_, "coef_"
+    ):
+        beta_hat = linear_estimator.best_estimator_.coef_  # for CV object
+    else:
+        raise ValueError("linear_estimator must expose a `coef_` attribute.")
+
+    # root mean square of the coefficients: their standard deviation under
+    # the zero-mean Gaussian assumption described in the Notes section
+    std = norm(beta_hat) / np.sqrt(n_features)
+    scale = std * np.ones(beta_hat.size)
+
+    return beta_hat, scale
diff --git a/src/hidimstat/standardized_svr.py b/src/hidimstat/standardized_svr.py
deleted file mode 100644
index 183b36d6..00000000
--- a/src/hidimstat/standardized_svr.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import numpy as np
-from numpy.linalg import norm
-from sklearn.model_selection import GridSearchCV
-from sklearn.pipeline import Pipeline
-from sklearn.svm import LinearSVR
-
-
-def standardized_svr(X, y, Cs=np.logspace(-7, 1, 9), n_jobs=1):
-    """Cross-validated SVR
-
-    Parameters
-    ----------
-    X : ndarray, shape (n_samples, n_features)
-        Data.
-
-    y : ndarray, shape (n_samples,)
-        Target.
-
-    Cs : ndarray, optional (default=np.logspace(-7, 1, 9))
-        The linear SVR regularization parameter is set by cross-val running
-        a grid search on the list of hyper-parameters contained in Cs.
-
-    n_jobs : int or None, optional (default=1)
-        Number of CPUs to use during the cross validation.
-
-    Returns
-    -------
-    beta_hat : array, shape (n_features,)
-        Estimated parameter vector.
-
-    scale : ndarray, shape (n_features,)
-        Value of the standard deviation of the parameters.
-    """
-
-    n_samples, n_features = X.shape
-
-    steps = [("SVR", LinearSVR())]
-    pipeline = Pipeline(steps)
-    parameters = {"SVR__C": Cs}
-
-    grid = GridSearchCV(pipeline, param_grid=parameters, n_jobs=n_jobs)
-    grid.fit(X, y)
-
-    beta_hat = grid.best_estimator_.named_steps["SVR"].coef_
-
-    std = norm(beta_hat) / np.sqrt(n_features)
-    scale = std * np.ones(beta_hat.size)
-
-    return beta_hat, scale
diff --git a/test/test_empirical_thresholding.py b/test/test_empirical_thresholding.py
new file mode 100644
index 00000000..d6a504c4
--- /dev/null
+++ b/test/test_empirical_thresholding.py
@@ -0,0 +1,78 @@
+"""
+Test the empirical thresholding module
+"""
+
+import pytest
+import numpy as np
+from numpy.testing import assert_almost_equal
+
+from hidimstat.scenario import multivariate_1D_simulation
+from hidimstat.empirical_thresholding import empirical_thresholding
+from hidimstat.stat_tools import pval_from_scale
+
+from sklearn.linear_model import Lasso
+from sklearn.tree import DecisionTreeRegressor
+
+
+def test_empirical_thresholding():
+    """Testing the procedure on a simulation with no structure and a support
+    of size 1. Computing one-sided p-values, we want a low p-value
+    for the first feature and p-values close to 0.5 for the others."""
+
+    n_samples, n_features = 20, 50
+    support_size = 1
+    sigma = 0.1
+    rho = 0.0
+
+    X_init, y, beta, noise = multivariate_1D_simulation(
+        n_samples=n_samples,
+        n_features=n_features,
+        support_size=support_size,
+        sigma=sigma,
+        rho=rho,
+        shuffle=False,
+        seed=3,
+    )
+
+    beta_hat, scale_hat = empirical_thresholding(X_init, y)
+
+    pval, pval_corr, _, _ = pval_from_scale(beta_hat, scale_hat)
+
+    expected = 0.5 * np.ones(n_features)
+    expected[:support_size] = 0.0
+
+    assert_almost_equal(pval_corr, expected, decimal=1)
+
+
+def test_empirical_thresholding_lasso():
+    """Testing the procedure on a simulation with no structure and a support
+    of size 1 with Lasso."""
+
+    n_samples, n_features = 20, 50
+    support_size = 1
+    sigma = 0.1
+    rho = 0.0
+
+    X_init, y, beta, noise = multivariate_1D_simulation(
+        n_samples=n_samples,
+        n_features=n_features,
+        support_size=support_size,
+        sigma=sigma,
+        rho=rho,
+        shuffle=False,
+        seed=3,
+    )
+
+    with pytest.raises(ValueError, match="must expose a"):
+        beta_hat, scale_hat = empirical_thresholding(
+            X_init, y, linear_estimator=DecisionTreeRegressor()
+        )
+
+    beta_hat, scale_hat = empirical_thresholding(X_init, y, linear_estimator=Lasso())
+
+    pval, pval_corr, _, _ = pval_from_scale(beta_hat, scale_hat)
+
+    expected = 0.5 * np.ones(n_features)
+    expected[:support_size] = 0.0
+
+    assert_almost_equal(pval_corr, expected, decimal=1)
diff --git a/test/test_standardized_svr.py b/test/test_standardized_svr.py
deleted file mode 100644
index a6665c59..00000000
--- a/test/test_standardized_svr.py
+++ /dev/null
@@ -1,43 +0,0 @@
-"""
-Test the standardized_svr module
-"""
-
-import numpy as np
-from numpy.testing import assert_almost_equal
-
-from hidimstat.scenario import multivariate_1D_simulation
-from hidimstat.standardized_svr import standardized_svr
-from hidimstat.stat_tools import pval_from_scale
-
-
-def test_standardized_svr():
-    """Testing the procedure on a simulation with no structure and a support
-    of size 1. Computing one-sided p-values, we want a low p-value
-    for the first feature and p-values close to 0.5 for the others."""
-
-    n_samples, n_features = 20, 50
-    support_size = 1
-    sigma = 0.1
-    rho = 0.0
-
-    X_init, y, beta, noise = multivariate_1D_simulation(
-        n_samples=n_samples,
-        n_features=n_features,
-        support_size=support_size,
-        sigma=sigma,
-        rho=rho,
-        shuffle=False,
-        seed=3,
-    )
-
-    y = y - np.mean(y)
-    X_init = X_init - np.mean(X_init, axis=0)
-
-    beta_hat, scale_hat = standardized_svr(X_init, y)
-
-    pval, pval_corr, _, _ = pval_from_scale(beta_hat, scale_hat)
-
-    expected = 0.5 * np.ones(n_features)
-    expected[:support_size] = 0.0
-
-    assert_almost_equal(pval_corr, expected, decimal=1)
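
Usage sketch (illustrative only, not part of the patch): a minimal example of how the renamed `empirical_thresholding` entry point is expected to be combined with `pval_from_scale`, reusing the `multivariate_1D_simulation` setup from the tests above. The `Lasso` call is just an assumed alternative linear estimator, and the printed index is only the outcome expected for this simulated data.

import numpy as np
from sklearn.linear_model import Lasso

from hidimstat.empirical_thresholding import empirical_thresholding
from hidimstat.scenario import multivariate_1D_simulation
from hidimstat.stat_tools import pval_from_scale

# simulate a small problem with a single active (first) feature, as in the tests
X, y, beta, noise = multivariate_1D_simulation(
    n_samples=20,
    n_features=50,
    support_size=1,
    sigma=0.1,
    rho=0.0,
    shuffle=False,
    seed=3,
)

# default estimator: a cross-validated LinearSVR; any linear estimator that
# exposes `coef_` (directly or via `best_estimator_`) can be passed instead
beta_hat, scale = empirical_thresholding(X, y)
beta_hat_lasso, scale_lasso = empirical_thresholding(X, y, linear_estimator=Lasso())

# turn the coefficients and their common scale into (corrected) one-sided p-values
pval, pval_corr, one_minus_pval, one_minus_pval_corr = pval_from_scale(beta_hat, scale)
print(np.argmin(pval_corr))  # expected to point at the single active feature (index 0)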