Merge branch 'main' into PR_estimation_threshold
lionelkusch committed Feb 18, 2025
2 parents 8e9db7e + 095dacd commit a647319
Showing 93 changed files with 1,625 additions and 651 deletions.
3 changes: 2 additions & 1 deletion doc_conf/api.rst
@@ -26,7 +26,8 @@ Functions
knockoff_aggregation
model_x_knockoff
multivariate_1D_simulation
permutation_test_cv
permutation_test
permutation_test_pval
reid
empirical_thresholding
zscore_from_pval
7 changes: 5 additions & 2 deletions doc_conf/index.rst
@@ -37,6 +37,9 @@ HiDimStat depends on the following packages::
numpy
scipy
scikit-learn
pandas
torch
torchmetrics


To run examples it is necessary to install ``matplotlib``, and to run tests it
@@ -71,8 +74,8 @@ To build the documentation you will need to run:

.. code-block::
pip install -U sphinx_gallery sphinx_bootstrap_theme
cd doc
pip install -U .[doc]
cd doc_conf
make html
100 changes: 98 additions & 2 deletions doc_conf/references.bib
@@ -176,7 +176,6 @@ @article{liuFastPowerfulConditional2021
abstract = {We consider the problem of conditional independence testing: given a response Y and covariates (X,Z), we test the null hypothesis that Y is independent of X given Z. The conditional randomization test (CRT) was recently proposed as a way to use distributional information about X{\textbar}Z to exactly (non-asymptotically) control Type-I error using any test statistic in any dimensionality without assuming anything about Y{\textbar}(X,Z). This flexibility in principle allows one to derive powerful test statistics from complex prediction algorithms while maintaining statistical validity. Yet the direct use of such advanced test statistics in the CRT is prohibitively computationally expensive, especially with multiple testing, due to the CRT's requirement to recompute the test statistic many times on resampled data. We propose the distilled CRT, a novel approach to using state-of-the-art machine learning algorithms in the CRT while drastically reducing the number of times those algorithms need to be run, thereby taking advantage of their power and the CRT's statistical guarantees without suffering the usual computational expense. In addition to distillation, we propose a number of other tricks like screening and recycling computations to further speed up the CRT without sacrificing its high power and exact validity. Indeed, we show in simulations that all our proposals combined lead to a test that has similar power to the most powerful existing CRT implementations but requires orders of magnitude less computation, making it a practical tool even for large data sets. We demonstrate these benefits on a breast cancer dataset by identifying biomarkers related to cancer stage.},
archiveprefix = {arxiv},
keywords = {Statistics - Methodology},
file = {/home/ahmad/Zotero/storage/8HRQZX3H/Liu et al. - 2021 - Fast and Powerful Conditional Randomization Testin.pdf;/home/ahmad/Zotero/storage/YFNDKN2B/2006.html}
}

@thesis{chevalier_statistical_2020,
@@ -188,4 +187,101 @@ @thesis{chevalier_statistical_2020
urldate = {2024-10-17},
date = {2020-12-11},
langid = {english},
}
@article{benjamini1995controlling,
title={Controlling the false discovery rate: a practical and powerful approach to multiple testing},
author={Benjamini, Yoav and Hochberg, Yosef},
journal={Journal of the Royal Statistical Society: Series B (Methodological)},
volume={57},
number={1},
pages={289--300},
year={1995},
publisher={Wiley Online Library}
}


@article{wang2022false,
title={False discovery rate control with e-values},
author={Wang, Ruodu and Ramdas, Aaditya},
journal={Journal of the Royal Statistical Society Series B: Statistical Methodology},
volume={84},
number={3},
pages={822--852},
year={2022},
publisher={Oxford University Press}
}

@article{ramdas2017wasserstein,
title={On Wasserstein two-sample testing and related families of nonparametric tests},
author={Ramdas, Aaditya and Garc{\'\i}a Trillos, Nicol{\'a}s and Cuturi, Marco},
journal={Entropy},
volume={19},
number={2},
pages={47},
year={2017},
publisher={MDPI}
}

@article{ramdas2017online,
title={Online control of the false discovery rate with decaying memory},
author={Ramdas, Aaditya and Yang, Fanny and Wainwright, Martin J and Jordan, Michael I},
journal={Advances in Neural Information Processing Systems},
volume={30},
year={2017}
}

@article{meinshausen2008hierarchical,
title={Hierarchical testing of variable importance},
author={Meinshausen, Nicolai},
journal={Biometrika},
volume={95},
number={2},
pages={265--278},
year={2008},
publisher={Oxford University Press}
}
@article{meinshausen2009p,
title={P-values for high-dimensional regression},
author={Meinshausen, Nicolai and Meier, Lukas and B{\"u}hlmann, Peter},
journal={Journal of the American Statistical Association},
volume={104},
number={488},
pages={1671--1681},
year={2009},
publisher={Taylor \& Francis}
}

@book{westfall1993resampling,
title={Resampling-based multiple testing: Examples and methods for p-value adjustment},
author={Westfall, Peter H and Young, S Stanley},
volume={279},
year={1993},
publisher={John Wiley \& Sons}
}

@article{hirschhorn2005genome,
title={Genome-wide association studies for common diseases and complex traits},
author={Hirschhorn, Joel N and Daly, Mark J},
journal={Nature Reviews Genetics},
volume={6},
number={2},
pages={95--108},
year={2005},
publisher={Nature Publishing Group UK London}
}
@article{gaonkar_deriving_2012,
title = {Deriving statistical significance maps for {SVM} based image classification and group comparisons},
volume = {15},
url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3703958/},
pages = {723--730},
number = {0},
journal = {Medical Image Computing and Computer-Assisted Intervention ({MICCAI})},
author = {Gaonkar, Bilwaj and Davatzikos, Christos},
urldate = {2024-12-16},
year = {2012},
pmid = {23285616},
pmcid = {PMC3703958},
}
@@ -78,7 +78,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
"version": "3.12.9"
}
},
"nbformat": 4,
@@ -52,15 +52,18 @@
from sklearn.cluster import FeatureAgglomeration
from sklearn.feature_extraction import image
from sklearn.linear_model import Ridge
from sklearn.svm import LinearSVR
from sklearn.utils import Bunch

from hidimstat.adaptive_permutation_threshold import ada_svr
from hidimstat.ada_svr import ada_svr
from hidimstat.clustered_inference import clustered_inference
from hidimstat.ensemble_clustered_inference import ensemble_clustered_inference
from hidimstat.permutation_test import permutation_test, permutation_test_cv
from hidimstat.permutation_test import permutation_test, permutation_test_pval
from hidimstat.standardized_svr import standardized_svr
from hidimstat.stat_tools import pval_from_scale, zscore_from_pval

n_job = None


#############################################################################
# Function to fetch and preprocess Haxby dataset
@@ -151,19 +154,28 @@ def preprocess_haxby(subject=2, memory=None):

SVR_permutation_test_inference = False
if SVR_permutation_test_inference:
# We computed the regularization parameter by CV (C = 0.1)
pval_corr_svr_perm_test, one_minus_pval_corr_svr_perm_test = permutation_test_cv(
X, y, n_permutations=50, C=0.1
    # It would be better to combine cross-validation with the estimator,
    # but for the sake of time this is not done.
estimator = LinearSVR()
weight_svr, weight_svr_distribution = permutation_test(
X, y, estimator, n_permutations=50
)
pval_corr_svr_perm_test, one_minus_pval_corr_svr_perm_test = permutation_test_pval(
weight_svr, weight_svr_distribution
)

# Another method is to compute the p-values by permutation test from the
# Ridge decoder. The solution provided by this method should be very close to
# the previous one and the computation time is much shorter: around 20 seconds.

# We computed the parameter from cross-validation (alpha = 0.0215).
# It would be better to use RidgeCV, but for the sake of time this is not done.
estimator = Ridge()
pval_corr_ridge_perm_test, one_minus_pval_corr_ridge_perm_test = permutation_test(
weight_ridge, weight_ridge_distribution = permutation_test(
X, y, estimator=estimator, n_permutations=200
)
pval_corr_ridge_perm_test, one_minus_pval_corr_ridge_perm_test = permutation_test_pval(
weight_ridge, weight_ridge_distribution
)
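
To make the idea behind the two calls above concrete, here is a minimal, self-contained sketch of a max-statistic permutation test on synthetic data. It illustrates the general technique only, not hidimstat's actual implementation; the synthetic `X`/`y`, the `null_max` array, and the `+1` smoothing convention are assumptions of this sketch.

```python
import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.default_rng(0)
n_samples, n_features = 50, 10
X = rng.standard_normal((n_samples, n_features))
beta = np.zeros(n_features)
beta[0] = 3.0                                  # one truly active feature
y = X @ beta + rng.standard_normal(n_samples)

estimator = Ridge()
weights = estimator.fit(X, y).coef_            # observed decoder weights

n_permutations = 200
null_max = np.empty(n_permutations)
for b in range(n_permutations):
    y_perm = rng.permutation(y)                # break the X-y association
    w_perm = estimator.fit(X, y_perm).coef_
    null_max[b] = np.abs(w_perm).max()         # max statistic -> FWER control

# Corrected p-value per feature: fraction of permutations whose maximal
# null weight reaches the observed weight (with +1 smoothing).
pval_corr = (
    1 + (null_max[:, None] >= np.abs(weights)[None, :]).sum(axis=0)
) / (1 + n_permutations)
```

Refitting the same estimator on permuted targets and comparing each observed weight to the distribution of the permutation maximum is what makes the resulting p-values family-wise corrected, which is why no separate multiplicity adjustment appears afterwards.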

#############################################################################
# Now, let us run the algorithm introduced by Gaonkar et al. (cf. References).
@@ -305,4 +317,4 @@ def plot_map(
# (EnCluDL) seems realistic as we recover the visual cortex and do not make
# spurious discoveries.

show()
# show()
@@ -240,7 +240,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
"version": "3.12.9"
}
},
"nbformat": 4,
@@ -42,7 +42,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
"version": "3.12.9"
}
},
"nbformat": 4,
@@ -22,7 +22,7 @@
},
"outputs": [],
"source": [
"import numpy as np\nimport pandas as pd\nfrom nilearn import datasets\nfrom nilearn.image import mean_img\nfrom nilearn.input_data import NiftiMasker\nfrom nilearn.plotting import plot_stat_map, show\nfrom sklearn.cluster import FeatureAgglomeration\nfrom sklearn.feature_extraction import image\nfrom sklearn.linear_model import Ridge\nfrom sklearn.utils import Bunch\n\nfrom hidimstat.adaptive_permutation_threshold import ada_svr\nfrom hidimstat.clustered_inference import clustered_inference\nfrom hidimstat.ensemble_clustered_inference import ensemble_clustered_inference\nfrom hidimstat.permutation_test import permutation_test, permutation_test_cv\nfrom hidimstat.standardized_svr import standardized_svr\nfrom hidimstat.stat_tools import pval_from_scale, zscore_from_pval"
"import numpy as np\nimport pandas as pd\nfrom nilearn import datasets\nfrom nilearn.image import mean_img\nfrom nilearn.input_data import NiftiMasker\nfrom nilearn.plotting import plot_stat_map, show\nfrom sklearn.cluster import FeatureAgglomeration\nfrom sklearn.feature_extraction import image\nfrom sklearn.linear_model import Ridge\nfrom sklearn.svm import LinearSVR\nfrom sklearn.utils import Bunch\n\nfrom hidimstat.ada_svr import ada_svr\nfrom hidimstat.clustered_inference import clustered_inference\nfrom hidimstat.ensemble_clustered_inference import ensemble_clustered_inference\nfrom hidimstat.permutation_test import permutation_test, permutation_test_pval\nfrom hidimstat.standardized_svr import standardized_svr\nfrom hidimstat.stat_tools import pval_from_scale, zscore_from_pval\n\nn_job = None"
]
},
{
@@ -119,7 +119,7 @@
},
"outputs": [],
"source": [
"# To derive the p-values from the SVR decoder, you may change the next line by\n# `SVR_permutation_test_inference = True`. It should take around 15 minutes.\n\nSVR_permutation_test_inference = False\nif SVR_permutation_test_inference:\n # We computed the regularization parameter by CV (C = 0.1)\n pval_corr_svr_perm_test, one_minus_pval_corr_svr_perm_test = permutation_test_cv(\n X, y, n_permutations=50, C=0.1\n )\n\n# Another method is to compute the p-values by permutation test from the\n# Ridge decoder. The solution provided by this method should be very close to\n# the previous one and the computation time is much shorter: around 20 seconds.\n\nestimator = Ridge()\npval_corr_ridge_perm_test, one_minus_pval_corr_ridge_perm_test = permutation_test(\n X, y, estimator=estimator, n_permutations=200\n)"
"# To derive the p-values from the SVR decoder, you may change the next line by\n# `SVR_permutation_test_inference = True`. It should take around 15 minutes.\n\nSVR_permutation_test_inference = False\nif SVR_permutation_test_inference:\n    # It would be better to combine cross-validation with the estimator,\n    # but for the sake of time this is not done.\n    estimator = LinearSVR()\n    weight_svr, weight_svr_distribution = permutation_test(\n        X, y, estimator, n_permutations=50\n    )\n    pval_corr_svr_perm_test, one_minus_pval_corr_svr_perm_test = permutation_test_pval(\n        weight_svr, weight_svr_distribution\n    )\n\n# Another method is to compute the p-values by permutation test from the\n# Ridge decoder. The solution provided by this method should be very close to\n# the previous one and the computation time is much shorter: around 20 seconds.\n# We computed the parameter from cross-validation (alpha = 0.0215).\n# It would be better to use RidgeCV, but for the sake of time this is not done.\nestimator = Ridge()\nweight_ridge, weight_ridge_distribution = permutation_test(\n    X, y, estimator=estimator, n_permutations=200\n)\npval_corr_ridge_perm_test, one_minus_pval_corr_ridge_perm_test = permutation_test_pval(\n    weight_ridge, weight_ridge_distribution\n)"
]
},
{
@@ -281,7 +281,7 @@
},
"outputs": [],
"source": [
"show()"
"# show()"
]
}
],
@@ -301,7 +301,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
"version": "3.12.9"
}
},
"nbformat": 4,
@@ -186,7 +186,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
"version": "3.12.9"
}
},
"nbformat": 4,
@@ -132,7 +132,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
"version": "3.12.9"
}
},
"nbformat": 4,
3 changes: 2 additions & 1 deletion docs/_sources/api.rst.txt
@@ -26,7 +26,8 @@ Functions
knockoff_aggregation
model_x_knockoff
multivariate_1D_simulation
permutation_test_cv
permutation_test
permutation_test_pval
reid
standardized_svr
zscore_from_pval
@@ -554,9 +554,9 @@ randomization.

.. rst-class:: sphx-glr-timing

**Total running time of the script:** (1 minutes 5.243 seconds)
**Total running time of the script:** (1 minutes 5.593 seconds)

**Estimated memory usage:** 701 MB
**Estimated memory usage:** 722 MB


.. _sphx_glr_download_auto_examples_plot_2D_simulation_example.py:
4 changes: 2 additions & 2 deletions docs/_sources/auto_examples/plot_dcrt_example.rst.txt
@@ -162,9 +162,9 @@ Plotting the comparison

.. rst-class:: sphx-glr-timing

**Total running time of the script:** (1 minutes 2.810 seconds)
**Total running time of the script:** (1 minutes 2.729 seconds)

**Estimated memory usage:** 638 MB
**Estimated memory usage:** 658 MB


.. _sphx_glr_download_auto_examples_plot_dcrt_example.py:
@@ -491,9 +491,9 @@ Analyze the results
.. rst-class:: sphx-glr-timing

**Total running time of the script:** (0 minutes 8.765 seconds)
**Total running time of the script:** (0 minutes 8.186 seconds)

**Estimated memory usage:** 623 MB
**Estimated memory usage:** 643 MB


.. _sphx_glr_download_auto_examples_plot_diabetes_variable_importance_example.py:
