From e202b840227652c94f1002433aaa279c9568cb89 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Jan 2025 10:13:43 +0100 Subject: [PATCH 01/48] Remove unecesarry argument --- src/hidimstat/knockoffs.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 30eed4c6..6dfc0984 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -8,7 +8,6 @@ """ import numpy as np from sklearn.preprocessing import StandardScaler -from sklearn.utils.validation import check_memory from .gaussian_knockoff import _estimate_distribution, gaussian_knockoff_generation from .stat_coef_diff import _coef_diff_threshold, stat_coef_diff @@ -19,13 +18,11 @@ def model_x_knockoff( y, fdr=0.1, offset=1, - method="equi", statistics="lasso_cv", shrink=False, centered=True, cov_estimator="ledoit_wolf", verbose=False, - memory=None, n_jobs=1, seed=None, ): @@ -49,10 +46,6 @@ def model_x_knockoff( offset to calculate knockoff threshold, offset = 1 is equivalent to knockoff+ - method : str, optional - knockoff construction methods, either equi for equi-correlated knockoff - or sdp for optimization scheme - statistics : str, optional method to calculate knockoff test score @@ -86,17 +79,16 @@ def model_x_knockoff( ---------- .. footbibliography:: """ - memory = check_memory(memory) if centered: X = StandardScaler().fit_transform(X) - mu, Sigma = _estimate_distribution(X, shrink=shrink, cov_estimator=cov_estimator) + mu, sigma = _estimate_distribution(X, shrink=shrink, cov_estimator=cov_estimator) X_tilde = gaussian_knockoff_generation( - X, mu, Sigma, memory=memory, method=method, seed=seed + X, mu, sigma, seed=seed ) - test_score = memory.cache(stat_coef_diff, ignore=["n_jobs", "joblib_verbose"])( + test_score = stat_coef_diff( X, X_tilde, y, method=statistics, n_jobs=n_jobs ) thres = _coef_diff_threshold(test_score, fdr=fdr, offset=offset) From 82b829c0ccefce040aee5fbcbd798fa51ab95aee Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Jan 2025 10:15:12 +0100 Subject: [PATCH 02/48] Change the a bit the behavior --- src/hidimstat/stat_coef_diff.py | 44 ++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/src/hidimstat/stat_coef_diff.py b/src/hidimstat/stat_coef_diff.py index 3af1c3c3..a54a8f90 100644 --- a/src/hidimstat/stat_coef_diff.py +++ b/src/hidimstat/stat_coef_diff.py @@ -22,6 +22,7 @@ def stat_coef_diff( return_coef=False, solver="liblinear", seed=0, + tol=1e-8 ): """Calculate test statistic by doing estimation with Cross-validation on concatenated design matrix [X X_tilde] to find coefficients [beta @@ -58,6 +59,9 @@ def stat_coef_diff( return_coef : bool, optional return regression coefficient if set to True + + tol: float, optional + tolerance Returns ------- @@ -70,34 +74,39 @@ def stat_coef_diff( n_features = X.shape[1] X_ko = np.column_stack([X, X_tilde]) - lambda_max = np.max(np.dot(X_ko.T, y)) / (2 * n_features) - lambdas = np.linspace(lambda_max * np.exp(-n_lambdas), lambda_max, n_lambdas) + if method == "lasso_cv": + lambda_max = np.max(np.dot(X_ko.T, y)) / (2 * n_features) + lambdas = np.linspace(lambda_max * np.exp(-n_lambdas), lambda_max, n_lambdas) - cv = KFold(n_splits=5, shuffle=True, random_state=seed) + # check for replacing all of this by provided BaseSearchCV of scikit-learn + # this can help to generalize the methods and reduce the parameters of the functions + # the only problems can be lambdas???? 
+ cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed) estimator = { "lasso_cv": LassoCV( alphas=lambdas, n_jobs=n_jobs, verbose=joblib_verbose, - max_iter=int(1e4), + max_iter=n_iter, cv=cv, + tol=tol ), "logistic_l1": LogisticRegressionCV( penalty="l1", - max_iter=int(1e4), + max_iter=n_iter, solver=solver, cv=cv, n_jobs=n_jobs, - tol=1e-8, + tol=tol, ), "logistic_l2": LogisticRegressionCV( penalty="l2", - max_iter=int(1e4), + max_iter=n_iter, n_jobs=n_jobs, verbose=joblib_verbose, cv=cv, - tol=1e-8, + tol=tol, ), } @@ -131,24 +140,25 @@ def _coef_diff_threshold(test_score, fdr=0.1, offset=1): vector of test statistic fdr : float, optional - desired controlled FDR level + desired controlled FDR(false discovery rate) level offset : int, 0 or 1, optional offset equals 1 is the knockoff+ procedure Returns ------- - thres : float or np.inf + threshold : float or np.inf threshold level """ if offset not in (0, 1): raise ValueError("'offset' must be either 0 or 1") - t_mesh = np.sort(np.abs(test_score[test_score != 0])) - for t in t_mesh: - false_pos = np.sum(test_score <= -t) - selected = np.sum(test_score >= t) + threshold_mesh = np.sort(np.abs(test_score[test_score != 0])) + threshold_mesh.append(np.inf) # if there is no solution, the threshold is inf + # find the right value of t for getting a good fdr + for thresold in threshold_mesh: + false_pos = np.sum(test_score <= -thresold) + selected = np.sum(test_score >= thresold) if (offset + false_pos) / np.maximum(selected, 1) <= fdr: - return t - - return np.inf + break + return thresold From d4f80c2497e07048e7e24929a4f9d27c9dfd5184 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Jan 2025 10:22:54 +0100 Subject: [PATCH 03/48] Update the variables --- src/hidimstat/gaussian_knockoff.py | 85 ++++++++++++++---------------- 1 file changed, 40 insertions(+), 45 deletions(-) diff --git a/src/hidimstat/gaussian_knockoff.py b/src/hidimstat/gaussian_knockoff.py index 68c8f036..90e59f37 100644 --- a/src/hidimstat/gaussian_knockoff.py +++ b/src/hidimstat/gaussian_knockoff.py @@ -1,17 +1,19 @@ -"""Generation of model-x knockoff following equi-correlated method or -optimization scheme following Barber et al. (2015). Requires cvxopt. +"""GRequires cvxopt. """ import warnings import numpy as np from sklearn.covariance import GraphicalLassoCV, empirical_covariance, ledoit_wolf -from sklearn.utils.validation import check_memory -def gaussian_knockoff_generation(X, mu, Sigma, method="equi", memory=None, seed=None): - """Generate second-order knockoff variables using equi-correlated method. +def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14): + """ + Generate second-order knockoff variables using equi-correlated method. Reference: Candes et al. (2016), Barber et al. (2015) + + Generation of model-x knockoff following equi-correlated method or + optimization scheme following Barber et al. (2015). 
Parameters ---------- @@ -24,7 +26,7 @@ def gaussian_knockoff_generation(X, mu, Sigma, method="equi", memory=None, seed= method: str method to generate gaussian knockoff - Sigma : 2D ndarray (n_samples, n_features) + sigma : 2D ndarray (n_samples, n_features) empirical covariance matrix Returns @@ -32,33 +34,29 @@ def gaussian_knockoff_generation(X, mu, Sigma, method="equi", memory=None, seed= X_tilde : 2D ndarray (n_samples, n_features) knockoff design matrix """ - memory = check_memory(memory) - n_samples, n_features = X.shape - if method == "equi": - Diag_s = np.diag(_s_equi(Sigma)) - else: - raise ValueError( - "{} is not a valid knockoff " "contriction method".format(method) - ) - Sigma_inv_s = np.linalg.solve(Sigma, Diag_s) + # create a uniform noise for all the data + rng = np.random.RandomState(seed) + U_tilde = rng.randn(n_samples, n_features) + + Diag_s = np.diag(_s_equi(sigma, tol=tol)) + + Sigma_inv_s = np.linalg.solve(sigma, Diag_s) # First part on the RHS of equation 1.4 in Barber & Candes (2015) Mu_tilde = X - np.dot(X - mu, Sigma_inv_s) # To calculate the Cholesky decomposition later on - Sigma_tilde = 2 * Diag_s - Diag_s.dot(Sigma_inv_s.dot(Diag_s)) - while not _is_posdef(Sigma_tilde): - Sigma_tilde += 1e-10 * np.eye(n_features) + sigma_tilde = 2 * Diag_s - Diag_s.dot(Sigma_inv_s.dot(Diag_s)) + while not _is_posdef(sigma_tilde): + sigma_tilde += 1e-10 * np.eye(n_features) warnings.warn( "The conditional covariance matrix for knockoffs is not positive " - "definite. Adding minor positive value to the matrix." + "definite. Adding minor positive value to the matrix.", UserWarning ) - rng = np.random.RandomState(seed) - U_tilde = rng.randn(n_samples, n_features) # Equation 1.4 in Barber & Candes (2015) - X_tilde = Mu_tilde + np.dot(U_tilde, np.linalg.cholesky(Sigma_tilde)) + X_tilde = Mu_tilde + np.dot(U_tilde, np.linalg.cholesky(sigma_tilde)) return X_tilde @@ -82,13 +80,12 @@ def _is_posdef(X, tol=1e-14): eig_value = np.linalg.eigvalsh(X) return np.all(eig_value > tol) - -def _cov_to_corr(Sigma): +def _cov_to_corr(sigma): """Convert covariance matrix to correlation matrix Parameters ---------- - Sigma : 2D ndarray (n_features, n_features) + sigma : 2D ndarray (n_features, n_features) Covariance matrix Returns @@ -97,47 +94,45 @@ def _cov_to_corr(Sigma): Transformed correlation matrix """ - features_std = np.sqrt(np.diag(Sigma)) - Scale = np.outer(features_std, features_std) - - Corr_matrix = Sigma / Scale + features_std = np.sqrt(np.diag(sigma)) + scale = np.outer(features_std, features_std) - return Corr_matrix + corr_matrix = sigma / scale + return corr_matrix -def _estimate_distribution(X, shrink=False, cov_estimator="ledoit_wolf"): - alphas = [1e-3, 1e-2, 1e-1, 1] +def _estimate_distribution(X, shrink=False, cov_estimator="ledoit_wolf", alphas = [1e-3, 1e-2, 1e-1, 1], tol=1e-14): mu = X.mean(axis=0) - Sigma = empirical_covariance(X) + sigma = empirical_covariance(X) - if shrink or not _is_posdef(Sigma): + if shrink or not _is_posdef(sigma, tol=tol): if cov_estimator == "ledoit_wolf": - Sigma_shrink = ledoit_wolf(X, assume_centered=True)[0] + sigma_shrink = ledoit_wolf(X, assume_centered=True)[0] elif cov_estimator == "graph_lasso": model = GraphicalLassoCV(alphas=alphas) - Sigma_shrink = model.fit(X).covariance_ + sigma_shrink = model.fit(X).covariance_ else: raise ValueError( "{} is not a valid covariance estimated method".format(cov_estimator) ) - return mu, Sigma_shrink + return mu, sigma_shrink - return mu, Sigma + return mu, sigma -def _s_equi(Sigma): +def 
_s_equi(sigma, tol=1e-14): """Estimate diagonal matrix of correlation between real and knockoff variables using equi-correlated equation Parameters ---------- - Sigma : 2D ndarray (n_features, n_features) + sigma : 2D ndarray (n_features, n_features) empirical covariance matrix calculated from original design matrix Returns @@ -145,9 +140,9 @@ def _s_equi(Sigma): 1D ndarray (n_features, ) vector of diagonal values of estimated matrix diag{s} """ - n_features = Sigma.shape[0] + n_features = sigma.shape[0] - G = _cov_to_corr(Sigma) + G = _cov_to_corr(sigma) eig_value = np.linalg.eigvalsh(G) lambda_min = np.min(eig_value[0]) S = np.ones(n_features) * min(2 * lambda_min, 1) @@ -157,13 +152,13 @@ def _s_equi(Sigma): while psd is False: # if all eigval > 0 then the matrix is psd - psd = _is_posdef(2 * G - np.diag(S * (1 - s_eps))) + psd = _is_posdef(2 * G - np.diag(S * (1 - s_eps)), tol=tol) if not psd: if s_eps == 0: - s_eps = 1e-08 + s_eps = np.eps # avoid zeros else: s_eps *= 10 S = S * (1 - s_eps) - return S * np.diag(Sigma) + return S * np.diag(sigma) From 47dbe2483466c3c9ada7230797ebf9b3bd4990fd Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Jan 2025 10:51:10 +0100 Subject: [PATCH 04/48] Put all the knockoff test together --- test/test_knockoff_aggregation.py | 49 +++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/test/test_knockoff_aggregation.py b/test/test_knockoff_aggregation.py index a2984fc7..a94e334a 100644 --- a/test/test_knockoff_aggregation.py +++ b/test/test_knockoff_aggregation.py @@ -1,8 +1,10 @@ from hidimstat import knockoff_aggregation, model_x_knockoff +from hidimstat.gaussian_knockoff import gaussian_knockoff_generation from hidimstat.data_simulation import simu_data from hidimstat.utils import cal_fdp_power import numpy as np import pytest +from sklearn.covariance import LedoitWolf, GraphicalLassoCV n = 500 p = 100 @@ -114,3 +116,50 @@ def test_knockoff_aggregation(): method="e-values", random_state="test", ) + +def test_model_x_knockoff(): + seed = 42 + fdr = 0.2 + n = 300 + p = 300 + X, y, _, non_zero = simu_data(n, p, seed=seed) + ko_result = model_x_knockoff(X, y, fdr=fdr, seed=seed + 1) + fdp, power = cal_fdp_power(ko_result, non_zero) + + assert fdp <= 0.2 + assert power > 0.7 + + +def test_estimate_distribution(): + SEED = 42 + fdr = 0.1 + n = 100 + p = 50 + X, y, _, non_zero = simu_data(n, p, seed=SEED) + mu = X.mean(axis=0) + Sigma = LedoitWolf(assume_centered=True).fit(X).covariance_ + + + assert mu.size == p + assert Sigma.shape == (p, p) + + mu = X.mean(axis=0) + Sigma = GraphicalLassoCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X).covariance_ + + assert mu.size == p + assert Sigma.shape == (p, p) + + +def test_gaussian_knockoff_equi(): + SEED = 42 + fdr = 0.1 + n = 100 + p = 50 + X, y, _, non_zero = simu_data(n, p, seed=SEED) + mu = X.mean(axis=0) + Sigma = LedoitWolf(assume_centered=True).fit(X).covariance_ + + X_tilde = gaussian_knockoff_generation(X, mu, Sigma, seed=SEED * 2) + + assert X_tilde.shape == (n, p) + From 81e64c2d2079c182cccffa540781a8f3ea8ac7e8 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Jan 2025 10:52:39 +0100 Subject: [PATCH 05/48] Fix a bug --- src/hidimstat/stat_coef_diff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/stat_coef_diff.py b/src/hidimstat/stat_coef_diff.py index a54a8f90..ae4d04a1 100644 --- a/src/hidimstat/stat_coef_diff.py +++ b/src/hidimstat/stat_coef_diff.py @@ -154,7 +154,7 @@ def _coef_diff_threshold(test_score, fdr=0.1, offset=1): raise 
ValueError("'offset' must be either 0 or 1") threshold_mesh = np.sort(np.abs(test_score[test_score != 0])) - threshold_mesh.append(np.inf) # if there is no solution, the threshold is inf + np.append(threshold_mesh, np.inf) # if there is no solution, the threshold is inf # find the right value of t for getting a good fdr for thresold in threshold_mesh: false_pos = np.sum(test_score <= -thresold) From d6ded884c68346e0a38cb9990c53a5549d34d792 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Jan 2025 10:53:17 +0100 Subject: [PATCH 06/48] remove the function for estimating covariance --- src/hidimstat/gaussian_knockoff.py | 66 ++++----------------------- src/hidimstat/knockoff_aggregation.py | 31 +++++++------ src/hidimstat/knockoffs.py | 15 ++++-- 3 files changed, 37 insertions(+), 75 deletions(-) diff --git a/src/hidimstat/gaussian_knockoff.py b/src/hidimstat/gaussian_knockoff.py index 90e59f37..6dc8df85 100644 --- a/src/hidimstat/gaussian_knockoff.py +++ b/src/hidimstat/gaussian_knockoff.py @@ -4,7 +4,6 @@ import warnings import numpy as np -from sklearn.covariance import GraphicalLassoCV, empirical_covariance, ledoit_wolf def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14): @@ -12,9 +11,6 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14): Generate second-order knockoff variables using equi-correlated method. Reference: Candes et al. (2016), Barber et al. (2015) - Generation of model-x knockoff following equi-correlated method or - optimization scheme following Barber et al. (2015). - Parameters ---------- X: 2D ndarray (n_samples, n_features) @@ -48,7 +44,8 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14): Mu_tilde = X - np.dot(X - mu, Sigma_inv_s) # To calculate the Cholesky decomposition later on sigma_tilde = 2 * Diag_s - Diag_s.dot(Sigma_inv_s.dot(Diag_s)) - while not _is_posdef(sigma_tilde): + # test is the matrix is positive definite + while not np.all(np.linalg.eigvalsh(sigma_tilde) > tol): sigma_tilde += 1e-10 * np.eye(n_features) warnings.warn( "The conditional covariance matrix for knockoffs is not positive " @@ -61,25 +58,6 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14): return X_tilde -def _is_posdef(X, tol=1e-14): - """Check a matrix is positive definite by calculating eigenvalue of the - matrix - - Parameters - ---------- - X : 2D ndarray, shape (n_samples x n_features) - Matrix to check - - tol : float, optional - minimum threshold for eigenvalue - - Returns - ------- - True or False - """ - eig_value = np.linalg.eigvalsh(X) - return np.all(eig_value > tol) - def _cov_to_corr(sigma): """Convert covariance matrix to correlation matrix @@ -102,30 +80,6 @@ def _cov_to_corr(sigma): return corr_matrix -def _estimate_distribution(X, shrink=False, cov_estimator="ledoit_wolf", alphas = [1e-3, 1e-2, 1e-1, 1], tol=1e-14): - - mu = X.mean(axis=0) - sigma = empirical_covariance(X) - - if shrink or not _is_posdef(sigma, tol=tol): - - if cov_estimator == "ledoit_wolf": - sigma_shrink = ledoit_wolf(X, assume_centered=True)[0] - - elif cov_estimator == "graph_lasso": - model = GraphicalLassoCV(alphas=alphas) - sigma_shrink = model.fit(X).covariance_ - - else: - raise ValueError( - "{} is not a valid covariance estimated method".format(cov_estimator) - ) - - return mu, sigma_shrink - - return mu, sigma - - def _s_equi(sigma, tol=1e-14): """Estimate diagonal matrix of correlation between real and knockoff variables using equi-correlated equation @@ -147,17 +101,17 @@ def _s_equi(sigma, tol=1e-14): 
lambda_min = np.min(eig_value[0]) S = np.ones(n_features) * min(2 * lambda_min, 1) - psd = False + psd = np.all(np.linalg.eigvalsh(2 * G - np.diag(S)) > tol) s_eps = 0 while psd is False: - # if all eigval > 0 then the matrix is psd - psd = _is_posdef(2 * G - np.diag(S * (1 - s_eps)), tol=tol) - if not psd: - if s_eps == 0: - s_eps = np.eps # avoid zeros - else: - s_eps *= 10 + if s_eps == 0: + s_eps = np.eps # avoid zeros + else: + s_eps *= 10 + # if all eigval > 0 then the matrix is positive define + psd = np.all(np.linalg.eigvalsh(2 * G - np.diag(S * (1 - s_eps))) > tol) + S = S * (1 - s_eps) diff --git a/src/hidimstat/knockoff_aggregation.py b/src/hidimstat/knockoff_aggregation.py index ceeed56a..a3802f98 100644 --- a/src/hidimstat/knockoff_aggregation.py +++ b/src/hidimstat/knockoff_aggregation.py @@ -4,9 +4,9 @@ from joblib import Parallel, delayed from sklearn.preprocessing import StandardScaler from sklearn.utils import check_random_state -from sklearn.utils.validation import check_memory +from sklearn.covariance import LedoitWolf -from .gaussian_knockoff import _estimate_distribution, gaussian_knockoff_generation +from .gaussian_knockoff import gaussian_knockoff_generation from .stat_coef_diff import stat_coef_diff, _coef_diff_threshold from .utils import fdr_threshold, quantile_aggregation @@ -15,15 +15,13 @@ def knockoff_aggregation( X, y, centered=True, - shrink=False, - construct_method="equi", fdr=0.1, fdr_control="bhq", reshaping_function=None, offset=1, method="quantile", statistic="lasso_cv", - cov_estimator="ledoit_wolf", + cov_estimator=LedoitWolf(assume_centered=True), joblib_verbose=0, n_bootstraps=25, n_jobs=1, @@ -75,6 +73,12 @@ def knockoff_aggregation( The method to calculate knockoff test score. cov_estimator : srt, default="ledoitwolf" The method of empirical covariance matrix estimation. + example: + - LedoitWolf(assume_centered=True) + - GraphicalLassoCV(alphas=[1e-3, 1e-2, 1e-1, 1]) + - EmpiricalCovariance() + + joblib_versobe : int, default=0 The verbosity level of joblib: if non zero, progress messages are printed. Above 50, the output is sent to stdout. 
The frequency of the @@ -125,18 +129,15 @@ def knockoff_aggregation( if centered: X = StandardScaler().fit_transform(X) - mu, Sigma = _estimate_distribution(X, shrink=shrink, cov_estimator=cov_estimator) - - mem = check_memory(memory) - stat_coef_diff_cached = mem.cache( - stat_coef_diff, ignore=["n_jobs", "joblib_verbose"] - ) + # estimation of X distribution + mu = X.mean(axis=0) + Sigma = cov_estimator.fit(X).covariance_ if n_bootstraps == 1: X_tilde = gaussian_knockoff_generation( - X, mu, Sigma, method=construct_method, memory=memory, seed=random_state + X, mu, Sigma, seed=random_state ) - ko_stat = stat_coef_diff_cached(X, X_tilde, y, method=statistic) + ko_stat = stat_coef_diff(X, X_tilde, y, method=statistic) pvals = _empirical_pval(ko_stat, offset) threshold = fdr_threshold(pvals, fdr=fdr, method=fdr_control) selected = np.where(pvals <= threshold)[0] @@ -157,13 +158,13 @@ def knockoff_aggregation( parallel = Parallel(n_jobs, verbose=joblib_verbose) X_tildes = parallel( delayed(gaussian_knockoff_generation)( - X, mu, Sigma, method=construct_method, memory=memory, seed=seed + X, mu, Sigma, seed=seed ) for seed in seed_list ) ko_stats = parallel( - delayed(stat_coef_diff_cached)(X, X_tildes[i], y, method=statistic) + delayed(stat_coef_diff)(X, X_tildes[i], y, method=statistic) for i in range(n_bootstraps) ) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 6dfc0984..40965806 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -8,20 +8,21 @@ """ import numpy as np from sklearn.preprocessing import StandardScaler +from sklearn.covariance import LedoitWolf -from .gaussian_knockoff import _estimate_distribution, gaussian_knockoff_generation +from .gaussian_knockoff import gaussian_knockoff_generation from .stat_coef_diff import _coef_diff_threshold, stat_coef_diff + def model_x_knockoff( X, y, fdr=0.1, offset=1, statistics="lasso_cv", - shrink=False, centered=True, - cov_estimator="ledoit_wolf", + cov_estimator=LedoitWolf(assume_centered=True), verbose=False, n_jobs=1, seed=None, @@ -57,6 +58,10 @@ def model_x_knockoff( cov_estimator : str, optional method of empirical covariance matrix estimation + example: + - LedoitWolf(assume_centered=True) + - GraphicalLassoCV(alphas=[1e-3, 1e-2, 1e-1, 1]) + - EmpiricalCovariance() seed : int or None, optional random seed used to generate Gaussian knockoff variable @@ -83,7 +88,9 @@ def model_x_knockoff( if centered: X = StandardScaler().fit_transform(X) - mu, sigma = _estimate_distribution(X, shrink=shrink, cov_estimator=cov_estimator) + # estimation of X distribution + mu = X.mean(axis=0) + sigma = cov_estimator.fit(X).covariance_ X_tilde = gaussian_knockoff_generation( X, mu, sigma, seed=seed From 801c5f95740170021e9bafc471a3541983d40d4d Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Jan 2025 10:57:09 +0100 Subject: [PATCH 07/48] Remove unnecessary file --- test/test_generate_knockoff.py | 37 ---------------------------------- test/test_model_x_knockoff.py | 18 ----------------- 2 files changed, 55 deletions(-) delete mode 100644 test/test_generate_knockoff.py delete mode 100644 test/test_model_x_knockoff.py diff --git a/test/test_generate_knockoff.py b/test/test_generate_knockoff.py deleted file mode 100644 index 54056f79..00000000 --- a/test/test_generate_knockoff.py +++ /dev/null @@ -1,37 +0,0 @@ -# -*- coding: utf-8 -*- -# Authors: Binh Nguyen - -from hidimstat.data_simulation import simu_data -from hidimstat.gaussian_knockoff import ( - _estimate_distribution, - 
gaussian_knockoff_generation, -) - -SEED = 42 -fdr = 0.1 - - -def test_estimate_distribution(): - n = 100 - p = 50 - X, y, _, non_zero = simu_data(n, p, seed=SEED) - mu, Sigma = _estimate_distribution(X, cov_estimator="ledoit_wolf") - - assert mu.size == p - assert Sigma.shape == (p, p) - - mu, Sigma = _estimate_distribution(X, cov_estimator="graph_lasso") - - assert mu.size == p - assert Sigma.shape == (p, p) - - -def test_gaussian_knockoff_equi(): - n = 100 - p = 50 - X, y, _, non_zero = simu_data(n, p, seed=SEED) - mu, Sigma = _estimate_distribution(X, cov_estimator="ledoit_wolf") - - X_tilde = gaussian_knockoff_generation(X, mu, Sigma, method="equi", seed=SEED * 2) - - assert X_tilde.shape == (n, p) diff --git a/test/test_model_x_knockoff.py b/test/test_model_x_knockoff.py deleted file mode 100644 index 504e4694..00000000 --- a/test/test_model_x_knockoff.py +++ /dev/null @@ -1,18 +0,0 @@ -from hidimstat import model_x_knockoff -from hidimstat.data_simulation import simu_data -from hidimstat.utils import cal_fdp_power - -seed = 42 -fdr = 0.2 - - -def test_model_x_knockoff(): - - n = 300 - p = 300 - X, y, _, non_zero = simu_data(n, p, seed=seed) - ko_result = model_x_knockoff(X, y, fdr=fdr, seed=seed + 1) - fdp, power = cal_fdp_power(ko_result, non_zero) - - assert fdp <= 0.2 - assert power > 0.7 From 4c61dd768a3b3784465fca8d0eb7d16b67a58348 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Jan 2025 10:59:57 +0100 Subject: [PATCH 08/48] Remove a function --- src/hidimstat/gaussian_knockoff.py | 34 +++++++----------------------- 1 file changed, 8 insertions(+), 26 deletions(-) diff --git a/src/hidimstat/gaussian_knockoff.py b/src/hidimstat/gaussian_knockoff.py index 6dc8df85..b08b758e 100644 --- a/src/hidimstat/gaussian_knockoff.py +++ b/src/hidimstat/gaussian_knockoff.py @@ -58,28 +58,6 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14): return X_tilde -def _cov_to_corr(sigma): - """Convert covariance matrix to correlation matrix - - Parameters - ---------- - sigma : 2D ndarray (n_features, n_features) - Covariance matrix - - Returns - ------- - Corr_matrix : 2D ndarray (n_features, n_features) - Transformed correlation matrix - """ - - features_std = np.sqrt(np.diag(sigma)) - scale = np.outer(features_std, features_std) - - corr_matrix = sigma / scale - - return corr_matrix - - def _s_equi(sigma, tol=1e-14): """Estimate diagonal matrix of correlation between real and knockoff variables using equi-correlated equation @@ -96,12 +74,16 @@ def _s_equi(sigma, tol=1e-14): """ n_features = sigma.shape[0] - G = _cov_to_corr(sigma) - eig_value = np.linalg.eigvalsh(G) + # Convert covariance matrix to correlation matrix + features_std = np.sqrt(np.diag(sigma)) + scale = np.outer(features_std, features_std) + corr_matrix = sigma / scale + + eig_value = np.linalg.eigvalsh(corr_matrix) lambda_min = np.min(eig_value[0]) S = np.ones(n_features) * min(2 * lambda_min, 1) - psd = np.all(np.linalg.eigvalsh(2 * G - np.diag(S)) > tol) + psd = np.all(np.linalg.eigvalsh(2 * corr_matrix - np.diag(S)) > tol) s_eps = 0 while psd is False: @@ -110,7 +92,7 @@ def _s_equi(sigma, tol=1e-14): else: s_eps *= 10 # if all eigval > 0 then the matrix is positive define - psd = np.all(np.linalg.eigvalsh(2 * G - np.diag(S * (1 - s_eps))) > tol) + psd = np.all(np.linalg.eigvalsh(2 * corr_matrix - np.diag(S * (1 - s_eps))) > tol) S = S * (1 - s_eps) From b890cba6c2555584042a6204e92e5b9e1053c0a3 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Jan 2025 14:26:38 +0100 Subject: [PATCH 09/48] 
comparison with original code --- src/hidimstat/gaussian_knockoff.py | 17 +++++++++++++---- src/hidimstat/knockoffs.py | 12 ++++++++---- src/hidimstat/stat_coef_diff.py | 18 +++++++++++++----- 3 files changed, 34 insertions(+), 13 deletions(-) diff --git a/src/hidimstat/gaussian_knockoff.py b/src/hidimstat/gaussian_knockoff.py index b08b758e..5129a4c2 100644 --- a/src/hidimstat/gaussian_knockoff.py +++ b/src/hidimstat/gaussian_knockoff.py @@ -1,6 +1,3 @@ -"""GRequires cvxopt. -""" - import warnings import numpy as np @@ -11,6 +8,9 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14): Generate second-order knockoff variables using equi-correlated method. Reference: Candes et al. (2016), Barber et al. (2015) + original impelmentation: + https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/create_gaussian.R + Parameters ---------- X: 2D ndarray (n_samples, n_features) @@ -43,7 +43,7 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14): # First part on the RHS of equation 1.4 in Barber & Candes (2015) Mu_tilde = X - np.dot(X - mu, Sigma_inv_s) # To calculate the Cholesky decomposition later on - sigma_tilde = 2 * Diag_s - Diag_s.dot(Sigma_inv_s.dot(Diag_s)) + sigma_tilde = 2 * Diag_s - Diag_s.dot(Sigma_inv_s.dot(Diag_s)) #TODO extra operation Sigma_inv_s.dot(Diag_s) ?????? # test is the matrix is positive definite while not np.all(np.linalg.eigvalsh(sigma_tilde) > tol): sigma_tilde += 1e-10 * np.eye(n_features) @@ -61,6 +61,9 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14): def _s_equi(sigma, tol=1e-14): """Estimate diagonal matrix of correlation between real and knockoff variables using equi-correlated equation + + original implementation: + https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/solve_equi.R Parameters ---------- @@ -75,12 +78,18 @@ def _s_equi(sigma, tol=1e-14): n_features = sigma.shape[0] # Convert covariance matrix to correlation matrix + # as example see cov2corr from statsmodels features_std = np.sqrt(np.diag(sigma)) scale = np.outer(features_std, features_std) corr_matrix = sigma / scale + eig_value = np.linalg.eigvalsh(corr_matrix) lambda_min = np.min(eig_value[0]) + # check if the matrix is positive-defined + if lambda_min < 0: + raise Exception('The covariance matrix is not positive-definite.') + S = np.ones(n_features) * min(2 * lambda_min, 1) psd = np.all(np.linalg.eigvalsh(2 * corr_matrix - np.diag(S)) > tol) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 40965806..468b0619 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -31,6 +31,10 @@ def model_x_knockoff( It's an inference procedure to control False Discoveries Rate, based on :footcite:t:`candesPanningGoldModelX2017` + + original implementation: + https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/create_second_order.R + https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/knockoff_filter.R Parameters ---------- @@ -59,9 +63,9 @@ def model_x_knockoff( cov_estimator : str, optional method of empirical covariance matrix estimation example: - - LedoitWolf(assume_centered=True) - - GraphicalLassoCV(alphas=[1e-3, 1e-2, 1e-1, 1]) - - EmpiricalCovariance() + - LedoitWolf(assume_centered=True) # shrink the matrix + - GraphicalLassoCV(alphas=[1e-3, 1e-2, 1e-1, 1]) # skrink the matrix + - EmpiricalCovariance() better for positive defined mastrix seed : int or None, optional random seed used to generate Gaussian knockoff variable @@ -103,6 +107,6 @@ 
def model_x_knockoff( selected = np.where(test_score >= thres)[0] if verbose: - return selected, test_score, thres, X_tilde + return selected, thres, test_score, X_tilde return selected diff --git a/src/hidimstat/stat_coef_diff.py b/src/hidimstat/stat_coef_diff.py index ae4d04a1..04341b4f 100644 --- a/src/hidimstat/stat_coef_diff.py +++ b/src/hidimstat/stat_coef_diff.py @@ -31,6 +31,9 @@ def stat_coef_diff( W_j = abs(beta_j) - abs(beta_tilda_j) with j = 1, ..., n_features + + orriginal implementation: + https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/stats_glmnet_cv.R Parameters ---------- @@ -76,6 +79,7 @@ def stat_coef_diff( X_ko = np.column_stack([X, X_tilde]) if method == "lasso_cv": lambda_max = np.max(np.dot(X_ko.T, y)) / (2 * n_features) + # min is different from original implementation lambdas = np.linspace(lambda_max * np.exp(-n_lambdas), lambda_max, n_lambdas) # check for replacing all of this by provided BaseSearchCV of scikit-learn @@ -133,6 +137,9 @@ def stat_coef_diff( def _coef_diff_threshold(test_score, fdr=0.1, offset=1): """Calculate the knockoff threshold based on the procedure stated in the article. + + original code: + https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/knockoff_filter.R Parameters ---------- @@ -154,11 +161,12 @@ def _coef_diff_threshold(test_score, fdr=0.1, offset=1): raise ValueError("'offset' must be either 0 or 1") threshold_mesh = np.sort(np.abs(test_score[test_score != 0])) - np.append(threshold_mesh, np.inf) # if there is no solution, the threshold is inf + np.concatenate([[0], threshold_mesh, [np.inf]]) # if there is no solution, the threshold is inf # find the right value of t for getting a good fdr - for thresold in threshold_mesh: - false_pos = np.sum(test_score <= -thresold) - selected = np.sum(test_score >= thresold) + threshold = 0. + for threshold in threshold_mesh: + false_pos = np.sum(test_score <= -threshold) + selected = np.sum(test_score >= threshold) if (offset + false_pos) / np.maximum(selected, 1) <= fdr: break - return thresold + return threshold From b6fe9481c5faa971849b02e00468885582545816 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Jan 2025 15:34:14 +0100 Subject: [PATCH 10/48] improve docstring and function --- src/hidimstat/knockoffs.py | 189 ++++++++++++++++++++++---------- src/hidimstat/stat_coef_diff.py | 172 ----------------------------- src/hidimstat/stat_tools.py | 39 +++++++ 3 files changed, 171 insertions(+), 229 deletions(-) delete mode 100644 src/hidimstat/stat_coef_diff.py diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 468b0619..79c7c573 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -9,104 +9,179 @@ import numpy as np from sklearn.preprocessing import StandardScaler from sklearn.covariance import LedoitWolf +from sklearn.linear_model import LassoCV +from sklearn.model_selection import KFold from .gaussian_knockoff import gaussian_knockoff_generation -from .stat_coef_diff import _coef_diff_threshold, stat_coef_diff +from hidimstat.stat_tools import coef_diff_threshold +def preconfigure_estimator(estimator, X, X_tilde, y, n_lambdas=10): + """ + Configure the estimator for Model-X knockoffs. + + This function sets up the regularization path for the Lasso estimator + based on the input data and the number of lambdas to use. The regularization + path is defined by a sequence of lambda values, which control the amount + of shrinkage applied to the coefficient estimates. 
+ + Parameters: + estimator: sklearn.linear_model._base.LinearModel + An instance of a linear model estimator from sklearn.linear_model. + This estimator will be used to fit the data and compute the test + statistics. In this case, it should be an instance of LassoCV. + + X: 2D ndarray (n_samples, n_features) + The original design matrix. + + X_tilde: 2D ndarray (n_samples, n_features) + The knockoff design matrix. + + y: 1D ndarray (n_samples, ) + The target vector. + + n_lambdas: int, optional (default=10) + The number of lambda values to use to instansiate the cross validation. + + Returns: + None + The function modifies the `estimator` object in-place. + + Raises: + TypeError + If the estimator is not an instance of LassoCV. + + Note: + This function is specifically designed for the Model-X knockoffs procedure, + which combines original and knockoff variables in the design matrix. + """ + if type(estimator).__name__ != 'LassoCV': + raise TypeError('You should not use this function for configure your estimator') + + n_features = X.shape[1] + X_ko = np.column_stack([X, X_tilde]) + lambda_max = np.max(np.dot(X_ko.T, y)) / (2 * n_features) + lambdas = np.linspace(lambda_max * np.exp(-n_lambdas), lambda_max, n_lambdas) + estimator.alphas = lambdas + def model_x_knockoff( X, y, fdr=0.1, offset=1, - statistics="lasso_cv", + estimator=LassoCV(n_jobs=None, verbose=0,max_iter=1000, + cv=KFold(n_splits=5, shuffle=True, random_state=0), + tol=1e-8), + preconfigure_estimator = preconfigure_estimator, centered=True, cov_estimator=LedoitWolf(assume_centered=True), verbose=False, - n_jobs=1, seed=None, ): - """Model-X Knockoff + """ + Model-X Knockoff - It's an inference procedure to control False Discoveries Rate, - based on :footcite:t:`candesPanningGoldModelX2017` - - original implementation: - https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/create_second_order.R + This module implements the Model-X knockoff inference procedure, which is an approach + to control the False Discovery Rate (FDR) based on Candes et al. (2017). The original + implementation can be found at https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/knockoff_filter.R Parameters ---------- X : 2D ndarray (n_samples, n_features) - design matrix + The design matrix. y : 1D ndarray (n_samples, ) - response vector - - fdr : float, optional - desired controlled FDR level - - offset : int, 0 or 1, optional - offset to calculate knockoff threshold, offset = 1 is equivalent to - knockoff+ - - statistics : str, optional - method to calculate knockoff test score - - shrink : bool, optional - whether to shrink the empirical covariance matrix - - centered : bool, optional - whether to standardize the data before doing the inference procedure - - cov_estimator : str, optional - method of empirical covariance matrix estimation - example: - - LedoitWolf(assume_centered=True) # shrink the matrix - - GraphicalLassoCV(alphas=[1e-3, 1e-2, 1e-1, 1]) # skrink the matrix - - EmpiricalCovariance() better for positive defined mastrix - - seed : int or None, optional - random seed used to generate Gaussian knockoff variable - - Returns - ------- - selected : 1D array, int - vector of index of selected variables + The target vector. + + fdr : float, optional (default=0.1) + The desired controlled False Discovery Rate (FDR) level. + + offset : int, 0 or 1, optional (default=1) + The offset to calculate the knockoff threshold. An offset of 1 is equivalent to + knockoff+. 
+ + estimator : sklearn estimator instance, optional + The estimator used for fitting the data and computing the test statistics. + This can be any estimator with a `fit` method that accepts a 2D array and + a 1D array, and a `coef_` attribute that returns a 1D array of coefficients. + Examples include LassoCV, LogisticRegressionCV, and LinearRegression. + + preconfigure_estimator : function, optional + A function that configures the estimator for the Model-X knockoff procedure. + If provided, this function will be called with the estimator, X, X_tilde, and y + as arguments, and should modify the estimator in-place. + + centered : bool, optional (default=True) + Whether to standardize the data before performing the inference procedure. + + cov_estimator : sklearn covariance estimator instance, optional + The method used to estimate the empirical covariance matrix. This can be any + estimator with a `fit` method that accepts a 2D array and a `covariance_` + attribute that returns a 2D array of the estimated covariance matrix. + Examples include LedoitWolf and GraphicalLassoCV. + + verbose : bool, optional (default=False) + Whether to return additional information along with the selected variables. + If True, the function will return a tuple containing the selected variables, + the threshold, the test scores, and the knockoff design matrix. If False, + the function will return only the selected variables. + + seed : int or None, optional (default=None) + The random seed used to generate the Gaussian knockoff variables. + Returns + ------- + selected : 1D array, int + A vector of indices of the selected variables. + + threshold : float + The knockoff threshold. test_score : 1D array, (n_features, ) - vector of test statistic - - thres : float - knockoff threshold + A vector of test statistics. X_tilde : 2D array, (n_samples, n_features) - knockoff design matrix + The knockoff design matrix. References ---------- .. 
footbibliography:: """ - + n_samples, n_features = X.shape + if centered: X = StandardScaler().fit_transform(X) # estimation of X distribution + # original implementation: + # https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/create_second_order.R mu = X.mean(axis=0) sigma = cov_estimator.fit(X).covariance_ + # Create knockoff variables X_tilde = gaussian_knockoff_generation( X, mu, sigma, seed=seed ) - test_score = stat_coef_diff( - X, X_tilde, y, method=statistics, n_jobs=n_jobs - ) - thres = _coef_diff_threshold(test_score, fdr=fdr, offset=offset) - - selected = np.where(test_score >= thres)[0] + + # Compute statistic base on a cross validation + # original implementation: + # https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/stats_glmnet_cv.R + X_ko = np.column_stack([X, X_tilde]) + if preconfigure_estimator is not None: + preconfigure_estimator(estimator, X, X_tilde, y) + estimator.fit(X_ko, y) + if hasattr(estimator, 'coef_'): + coef = np.ravel(estimator.coef_) + elif hasattr(estimator, 'best_estimator_') and hasattr(estimator.best_estimator_, 'coef_'): + coef = np.ravel(estimator.best_estimator_.coef_) # for CV object + test_score = np.abs(coef[:n_features]) - np.abs(coef[n_features:]) + + # run the knockoff filter + threshold = coef_diff_threshold(test_score, fdr=fdr, offset=offset) + selected = np.where(test_score >= threshold)[0] if verbose: - return selected, thres, test_score, X_tilde - - return selected + return selected, threshold, test_score, X_tilde, estimator + else: + return selected diff --git a/src/hidimstat/stat_coef_diff.py b/src/hidimstat/stat_coef_diff.py deleted file mode 100644 index 04341b4f..00000000 --- a/src/hidimstat/stat_coef_diff.py +++ /dev/null @@ -1,172 +0,0 @@ -# -*- coding: utf-8 -*- -# Authors: Binh Nguyen - -import numpy as np -from sklearn.linear_model import LassoCV, LogisticRegressionCV -from sklearn.model_selection import KFold - -# from sklearn.linear_model._coordinate_descent import _alpha_grid -# from sklearn.model_selection import GridSearchCV - - -def stat_coef_diff( - X, - X_tilde, - y, - method="lasso_cv", - n_splits=5, - n_jobs=1, - n_lambdas=10, - n_iter=1000, - joblib_verbose=0, - return_coef=False, - solver="liblinear", - seed=0, - tol=1e-8 -): - """Calculate test statistic by doing estimation with Cross-validation on - concatenated design matrix [X X_tilde] to find coefficients [beta - beta_tilda]. 
The test statistic is then: - - W_j = abs(beta_j) - abs(beta_tilda_j) - - with j = 1, ..., n_features - - orriginal implementation: - https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/stats_glmnet_cv.R - - Parameters - ---------- - X : 2D ndarray (n_samples, n_features) - Original design matrix - - X_tilde : 2D ndarray (n_samples, n_features) - Knockoff design matrix - - y : 1D ndarray (n_samples, ) - Response vector - - loss : str, optional - if the response vector is continuous, the loss used should be - 'least_square', otherwise - if the response vector is binary, it should be 'logistic' - - n_splits : int, optional - number of cross-validation folds - - solver : str, optional - solver used by sklearn function LogisticRegressionCV - - n_regu : int, optional - number of regulation used in the regression problem - - return_coef : bool, optional - return regression coefficient if set to True - - tol: float, optional - tolerance - - Returns - ------- - test_score : 1D ndarray (n_features, ) - vector of test statistic - - coef: 1D ndarray (n_features * 2, ) - coefficients of the estimation problem - """ - - n_features = X.shape[1] - X_ko = np.column_stack([X, X_tilde]) - if method == "lasso_cv": - lambda_max = np.max(np.dot(X_ko.T, y)) / (2 * n_features) - # min is different from original implementation - lambdas = np.linspace(lambda_max * np.exp(-n_lambdas), lambda_max, n_lambdas) - - # check for replacing all of this by provided BaseSearchCV of scikit-learn - # this can help to generalize the methods and reduce the parameters of the functions - # the only problems can be lambdas???? - cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed) - - estimator = { - "lasso_cv": LassoCV( - alphas=lambdas, - n_jobs=n_jobs, - verbose=joblib_verbose, - max_iter=n_iter, - cv=cv, - tol=tol - ), - "logistic_l1": LogisticRegressionCV( - penalty="l1", - max_iter=n_iter, - solver=solver, - cv=cv, - n_jobs=n_jobs, - tol=tol, - ), - "logistic_l2": LogisticRegressionCV( - penalty="l2", - max_iter=n_iter, - n_jobs=n_jobs, - verbose=joblib_verbose, - cv=cv, - tol=tol, - ), - } - - try: - clf = estimator[method] - except KeyError: - print(f"{method} is not a valid estimator") - - clf.fit(X_ko, y) - - try: - coef = np.ravel(clf.coef_) - except AttributeError: - coef = np.ravel(clf.best_estimator_.coef_) # for GridSearchCV object - - test_score = np.abs(coef[:n_features]) - np.abs(coef[n_features:]) - - if return_coef: - return test_score, coef - - return test_score - - -def _coef_diff_threshold(test_score, fdr=0.1, offset=1): - """Calculate the knockoff threshold based on the procedure stated in the - article. - - original code: - https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/knockoff_filter.R - - Parameters - ---------- - test_score : 1D ndarray, shape (n_features, ) - vector of test statistic - - fdr : float, optional - desired controlled FDR(false discovery rate) level - - offset : int, 0 or 1, optional - offset equals 1 is the knockoff+ procedure - - Returns - ------- - threshold : float or np.inf - threshold level - """ - if offset not in (0, 1): - raise ValueError("'offset' must be either 0 or 1") - - threshold_mesh = np.sort(np.abs(test_score[test_score != 0])) - np.concatenate([[0], threshold_mesh, [np.inf]]) # if there is no solution, the threshold is inf - # find the right value of t for getting a good fdr - threshold = 0. 
- for threshold in threshold_mesh: - false_pos = np.sum(test_score <= -threshold) - selected = np.sum(test_score >= threshold) - if (offset + false_pos) / np.maximum(selected, 1) <= fdr: - break - return threshold diff --git a/src/hidimstat/stat_tools.py b/src/hidimstat/stat_tools.py index 0ced13e9..51c1a6bf 100644 --- a/src/hidimstat/stat_tools.py +++ b/src/hidimstat/stat_tools.py @@ -390,3 +390,42 @@ def two_sided_pval_from_pval(pval, one_minus_pval=None, distrib="norm"): ) return two_sided_pval, two_sided_pval_corr + + +def coef_diff_threshold(test_score, fdr=0.1, offset=1): + """ + Calculate the knockoff threshold based on the procedure stated in the + article. + + original code: + https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/knockoff_filter.R + + Parameters + ---------- + test_score : 1D ndarray, shape (n_features, ) + vector of test statistic + + fdr : float, optional + desired controlled FDR(false discovery rate) level + + offset : int, 0 or 1, optional + offset equals 1 is the knockoff+ procedure + + Returns + ------- + threshold : float or np.inf + threshold level + """ + if offset not in (0, 1): + raise ValueError("'offset' must be either 0 or 1") + + threshold_mesh = np.sort(np.abs(test_score[test_score != 0])) + np.concatenate([[0], threshold_mesh, [np.inf]]) # if there is no solution, the threshold is inf + # find the right value of t for getting a good fdr + threshold = 0. + for threshold in threshold_mesh: + false_pos = np.sum(test_score <= -threshold) + selected = np.sum(test_score >= threshold) + if (offset + false_pos) / np.maximum(selected, 1) <= fdr: + break + return threshold From 43a3e9a6b4b57023e4ee2cdc12392ccdb4c94fbd Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Jan 2025 17:10:26 +0100 Subject: [PATCH 11/48] Merge file for knockoff together --- src/hidimstat/knockoff_aggregation.py | 248 ---------------------- src/hidimstat/knockoffs.py | 285 +++++++++++++++++++++++--- src/hidimstat/stat_tools.py | 39 ---- 3 files changed, 261 insertions(+), 311 deletions(-) delete mode 100644 src/hidimstat/knockoff_aggregation.py diff --git a/src/hidimstat/knockoff_aggregation.py b/src/hidimstat/knockoff_aggregation.py deleted file mode 100644 index a3802f98..00000000 --- a/src/hidimstat/knockoff_aggregation.py +++ /dev/null @@ -1,248 +0,0 @@ -# -*- coding: utf-8 -*- -# Authors: Binh Nguyen -import numpy as np -from joblib import Parallel, delayed -from sklearn.preprocessing import StandardScaler -from sklearn.utils import check_random_state -from sklearn.covariance import LedoitWolf - -from .gaussian_knockoff import gaussian_knockoff_generation -from .stat_coef_diff import stat_coef_diff, _coef_diff_threshold -from .utils import fdr_threshold, quantile_aggregation - - -def knockoff_aggregation( - X, - y, - centered=True, - fdr=0.1, - fdr_control="bhq", - reshaping_function=None, - offset=1, - method="quantile", - statistic="lasso_cv", - cov_estimator=LedoitWolf(assume_centered=True), - joblib_verbose=0, - n_bootstraps=25, - n_jobs=1, - adaptive_aggregation=False, - gamma=0.5, - gamma_min=0.05, - verbose=False, - memory=None, - random_state=None, -): - """ - Aggregation of Multiple knockoffs - - This function implements the aggregation of multiple knockoffs introduced by - :footcite:t:`pmlr-v119-nguyen20a` - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The input samples. - y : array-like of shape (n_samples,), - The target values (class labels in classification, real numbers in - regression). 
- centered : bool, default=True - Whether to standardize the data before doing the inference procedure. - shrink : bool, default=False - Whether to shrink the empirical covariance matrix. - construct_method : str, default="equi" - The knockoff construction methods. The options include: - - "equi" for equi-correlated knockoff - - "sdp" for optimization scheme - fdr : float, default=0.1 - The desired controlled FDR level - fdr_control : srt, default="bhq" - The control method for False Discovery Rate (FDR). The options include: - - "bhq" for Standard Benjamini-Hochberg procedure - - "bhy" for Benjamini-Hochberg-Yekutieli procedure - - "ebh" for e-BH procedure - reshaping_function : , default=None - The reshaping function defined in :footcite:t:`bhy_2001`. - offset : int, 0 or 1, optional - The offset to calculate knockoff threshold, offset = 1 is equivalent to - knockoff+. - method : srt, default="quantile" - The method to compute the statistical measures. The options include: - - "quantile" for p-values - - "e-values" for e-values - statistic : srt, default="lasso_cv" - The method to calculate knockoff test score. - cov_estimator : srt, default="ledoitwolf" - The method of empirical covariance matrix estimation. - example: - - LedoitWolf(assume_centered=True) - - GraphicalLassoCV(alphas=[1e-3, 1e-2, 1e-1, 1]) - - EmpiricalCovariance() - - - joblib_versobe : int, default=0 - The verbosity level of joblib: if non zero, progress messages are - printed. Above 50, the output is sent to stdout. The frequency of the - messages increases with the verbosity level. If it more than 10, all - iterations are reported. - n_bootstraps : int, default=25 - The number of bootstrapping iterations. - n_jobs : int, default=1 - The number of workers for parallel processing. - adaptive_aggregation : bool, default=False - Whether to apply the adaptive version of the quantile aggregation method - as in :footcite:t:`Meinshausen_2008`. - gamma: float, default=0.5 - The percentile value used for aggregation. - gamma_min : float, default=0.05 - The minimum percentile value used for aggregation. - verbose : bool, default=False - Whether to return the corresponding p-values of the variables along with - the list of selected variables. - memory : str or joblib.Memory object, default=None - Used to cache the output of the computation of the clustering - and the inference. By default, no caching is done. If a string is - given, it is the path to the caching directory. - random_state : int, default=None - Fixing the seeds of the random generator. - - Returns - ------- - selected : 1D array, int - The vector of index of selected variables. - aggregated_pval: 1D array, float - The vector of aggregated p-values. - pvals: 1D array, float - The vector of the corresponding p-values. - aggregated_eval: 1D array, float - The vector of aggregated e-values. - evals: 1D array, float - The vector of the corresponding e-values. - - References - ---------- - .. 
footbibliography:: - - """ - # unnecessary to have n_jobs > number of bootstraps - n_jobs = min(n_bootstraps, n_jobs) - - if centered: - X = StandardScaler().fit_transform(X) - - # estimation of X distribution - mu = X.mean(axis=0) - Sigma = cov_estimator.fit(X).covariance_ - - if n_bootstraps == 1: - X_tilde = gaussian_knockoff_generation( - X, mu, Sigma, seed=random_state - ) - ko_stat = stat_coef_diff(X, X_tilde, y, method=statistic) - pvals = _empirical_pval(ko_stat, offset) - threshold = fdr_threshold(pvals, fdr=fdr, method=fdr_control) - selected = np.where(pvals <= threshold)[0] - - if verbose: - return selected, pvals - - return selected - - if isinstance(random_state, (int, np.int32, np.int64)): - rng = check_random_state(random_state) - elif random_state is None: - rng = check_random_state(0) - else: - raise TypeError("Wrong type for random_state") - - seed_list = rng.randint(1, np.iinfo(np.int32).max, n_bootstraps) - parallel = Parallel(n_jobs, verbose=joblib_verbose) - X_tildes = parallel( - delayed(gaussian_knockoff_generation)( - X, mu, Sigma, seed=seed - ) - for seed in seed_list - ) - - ko_stats = parallel( - delayed(stat_coef_diff)(X, X_tildes[i], y, method=statistic) - for i in range(n_bootstraps) - ) - - if method == "e-values": - evals = np.array( - [_empirical_eval(ko_stats[i], fdr / 2, offset) for i in range(n_bootstraps)] - ) - - aggregated_eval = np.mean(evals, axis=0) - threshold = fdr_threshold(aggregated_eval, fdr=fdr, method="ebh") - selected = np.where(aggregated_eval >= threshold)[0] - - if verbose: - return selected, aggregated_eval, evals - - return selected - - if method == "quantile": - pvals = np.array( - [_empirical_pval(ko_stats[i], offset) for i in range(n_bootstraps)] - ) - - aggregated_pval = quantile_aggregation( - pvals, gamma=gamma, gamma_min=gamma_min, adaptive=adaptive_aggregation - ) - - threshold = fdr_threshold( - aggregated_pval, - fdr=fdr, - method=fdr_control, - reshaping_function=reshaping_function, - ) - selected = np.where(aggregated_pval <= threshold)[0] - - if verbose: - return selected, aggregated_pval, pvals - - return selected - - -def _empirical_pval(test_score, offset=1): - """ - This function implements the computation of the empirical p-values - """ - pvals = [] - n_features = test_score.size - - if offset not in (0, 1): - raise ValueError("'offset' must be either 0 or 1") - - test_score_inv = -test_score - for i in range(n_features): - if test_score[i] <= 0: - pvals.append(1) - else: - pvals.append( - (offset + np.sum(test_score_inv >= test_score[i])) / n_features - ) - - return np.array(pvals) - - -def _empirical_eval(test_score, fdr=0.1, offset=1): - """ - This function implements the computation of the empirical e-values - """ - evals = [] - n_features = test_score.size - - if offset not in (0, 1): - raise ValueError("'offset' must be either 0 or 1") - - ko_thr = _coef_diff_threshold(test_score, fdr=fdr, offset=offset) - - for i in range(n_features): - if test_score[i] < ko_thr: - evals.append(0) - else: - evals.append(n_features / (offset + np.sum(test_score <= -ko_thr))) - - return np.array(evals) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 79c7c573..333c5d27 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -11,12 +11,14 @@ from sklearn.covariance import LedoitWolf from sklearn.linear_model import LassoCV from sklearn.model_selection import KFold +from sklearn.utils import check_random_state +from joblib import Parallel, delayed -from .gaussian_knockoff import 
gaussian_knockoff_generation -from hidimstat.stat_tools import coef_diff_threshold +from hidimstat.gaussian_knockoff import gaussian_knockoff_generation, repeat_gaussian_knockoff_generation +from hidimstat.utils import fdr_threshold, quantile_aggregation -def preconfigure_estimator(estimator, X, X_tilde, y, n_lambdas=10): +def preconfigure_estimator_LaccosCV(estimator, X, X_tilde, y, n_lambdas=10): """ Configure the estimator for Model-X knockoffs. @@ -68,15 +70,12 @@ def preconfigure_estimator(estimator, X, X_tilde, y, n_lambdas=10): def model_x_knockoff( X, y, - fdr=0.1, - offset=1, estimator=LassoCV(n_jobs=None, verbose=0,max_iter=1000, cv=KFold(n_splits=5, shuffle=True, random_state=0), tol=1e-8), - preconfigure_estimator = preconfigure_estimator, + preconfigure_estimator = preconfigure_estimator_LaccosCV, centered=True, cov_estimator=LedoitWolf(assume_centered=True), - verbose=False, seed=None, ): """ @@ -94,19 +93,19 @@ def model_x_knockoff( y : 1D ndarray (n_samples, ) The target vector. - - fdr : float, optional (default=0.1) - The desired controlled False Discovery Rate (FDR) level. - - offset : int, 0 or 1, optional (default=1) - The offset to calculate the knockoff threshold. An offset of 1 is equivalent to - knockoff+. - + estimator : sklearn estimator instance, optional The estimator used for fitting the data and computing the test statistics. This can be any estimator with a `fit` method that accepts a 2D array and a 1D array, and a `coef_` attribute that returns a 1D array of coefficients. Examples include LassoCV, LogisticRegressionCV, and LinearRegression. + Configuration example: + LassoCV(alphas=lambdas, n_jobs=None, verbose=0, max_iter=1000, + cv=KFold(n_splits=5, shuffle=True, random_state=0), tol=1e-8) + LogisticRegressionCV(penalty="l1", max_iter=1000, solver="liblinear", + cv=KFold(n_splits=5, shuffle=True, random_state=0), n_jobs=None, tol=1e-8) + LogisticRegressionCV(penalty="l2", max_iter=1000, n_jobs=None, + verbose=0, cv=KFold(n_splits=5, shuffle=True, random_state=0), tol=1e-8,) preconfigure_estimator : function, optional A function that configures the estimator for the Model-X knockoff procedure. @@ -148,8 +147,6 @@ def model_x_knockoff( ---------- .. 
footbibliography:: """ - n_samples, n_features = X.shape - if centered: X = StandardScaler().fit_transform(X) @@ -164,9 +161,174 @@ def model_x_knockoff( X, mu, sigma, seed=seed ) + test_score = _stat_coefficient_diff(X, X_tilde, y, estimator, preconfigure_estimator) + + return test_score + +def knockoff_aggregation( + X, + y, + estimator=LassoCV(n_jobs=None, verbose=0,max_iter=1000, + cv=KFold(n_splits=5, shuffle=True, random_state=0), + tol=1e-8), + preconfigure_estimator = preconfigure_estimator_LaccosCV, + centered=True, + cov_estimator=LedoitWolf(assume_centered=True), + joblib_verbose=0, + n_bootstraps=25, + n_jobs=1, + random_state=None, +): + assert n_bootstraps <= 1, "the number of bootstraps should at least higher than 1" + # unnecessary to have n_jobs > number of bootstraps + n_jobs = min(n_bootstraps, n_jobs) + parallel = Parallel(n_jobs, verbose=joblib_verbose) + + # get the seed for the different run + if isinstance(random_state, (int, np.int32, np.int64)): + rng = check_random_state(random_state) + elif random_state is None: + rng = check_random_state(0) + else: + raise TypeError("Wrong type for random_state") + seed_list = rng.randint(1, np.iinfo(np.int32).max, n_bootstraps) + + if centered: + X = StandardScaler().fit_transform(X) + + # estimation of X distribution + mu = X.mean(axis=0) + sigma = cov_estimator.fit(X).covariance_ + + # Create knockoff variables + X_tilde, (Mu_tilde, sigma_tilde_decompose) = gaussian_knockoff_generation( + X, mu, sigma, seed=seed_list[0], repeat=True + ) + X_tildes = parallel( + delayed(repeat_gaussian_knockoff_generation)( + Mu_tilde, sigma_tilde_decompose, seed=seed + ) + for seed in seed_list[1:] + ) + X_tildes.insert(0, X_tilde) + + test_scores = parallel( + delayed(_stat_coefficient_diff)(X, X_tilde[i], y, estimator, preconfigure_estimator) + for i in range(n_bootstraps) + ) + + return test_scores + + +def model_x_knockoff_filter(test_score, fdr=0.1, offset=1, selection_only=True): + """ + Calculate the p-values and return the selected variables based on the knockoff filter. + + Parameters + ---------- + test_score : 1D array, (n_features, ) + A vector of test statistics. + + fdr : float, optional (default=0.1) + The desired controlled False Discovery Rate (FDR) level. + + offset : int, 0 or 1, optional (default=1) + The offset to calculate the knockoff threshold. An offset of 1 is equivalent to + knockoff+. + + selection_only : bool, optional (default=True) + Whether to return only the selected variables or additional information. + If True, the function will return only the selected variables. If False, + the function will return the selected variables, the threshold, and the test scores. + + Returns + ------- + selected : 1D array, int + A vector of indices of the selected variables. + + threshold : float + The knockoff threshold. + + test_score : 1D array, (n_features, ) + A vector of test statistics. + + Notes + ----- + This function calculates the knockoff threshold based on the test statistics and the + desired FDR level. It then identifies the selected variables based on the threshold. 
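For orientation, a minimal usage sketch of the refactored two-step API (the synthetic data, sizes, and FDR level are illustrative assumptions; the function names and signatures are the ones introduced in this series):

    import numpy as np
    from hidimstat.knockoffs import model_x_knockoff, model_x_knockoff_filter

    rng = np.random.RandomState(0)
    X = rng.randn(200, 50)
    y = X[:, :5].sum(axis=1) + rng.randn(200)  # only the first 5 features are informative

    # step 1: compute the knockoff statistics W_j
    test_score = model_x_knockoff(X, y, seed=0)
    # step 2: apply the knockoff+ filter at the target FDR level
    selected, threshold = model_x_knockoff_filter(test_score, fdr=0.2, selection_only=False)

Separating the statistic computation from the filtering is what lets the same test scores be reused by the p-value and aggregation helpers below.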
+ """ + if offset not in (0, 1): + raise ValueError("'offset' must be either 0 or 1") + + # run the knockoff filter + threshold = _knockoff_threshold(test_score, fdr=fdr, offset=offset) + selected = np.where(test_score >= threshold)[0] + + if selection_only: + return selected + else: + return selected, threshold + + +def model_x_knockoff_pvalue(test_score, fdr=0.1, fdr_control="bhq", offset=1, selection_only=True): + """ + This function implements the computation of the empirical p-values + """ + pvals = _empirical_pval(test_score, offset) + threshold = fdr_threshold(pvals, fdr=fdr, method=fdr_control) + selected = np.where(pvals <= threshold)[0] + + if selection_only: + return selected + else: + return selected, pvals + + +def model_x_knockoff_bootstrap_e_value(test_score, fdr=0.1, offset=1, selection_only=True): + n_bootstraps = len(test_score) + evals = np.array( + [_empirical_eval(test_score[i], fdr / 2, offset) for i in range(n_bootstraps)] + ) + + aggregated_eval = np.mean(evals, axis=0) + threshold = fdr_threshold(aggregated_eval, fdr=fdr, method="ebh") + selected = np.where(aggregated_eval >= threshold)[0] + + if selection_only: + return selected + else: + return selected, aggregated_eval, evals + + +def model_x_knockoff_bootstrap_quantile(test_score, fdr=0.1, fdr_control="bhq", reshaping_function=None, adaptive_aggregation=False, gamma=0.5, gamma_min=0.05, offset=1, selection_only=True): + n_bootstraps = len(test_score) + pvals = np.array( + [_empirical_pval(test_score[i], offset) for i in range(n_bootstraps)] + ) + + aggregated_pval = quantile_aggregation( + pvals, gamma=gamma, gamma_min=gamma_min, adaptive=adaptive_aggregation + ) + + threshold = fdr_threshold( + aggregated_pval, + fdr=fdr, + method=fdr_control, + reshaping_function=reshaping_function, + ) + selected = np.where(aggregated_pval <= threshold)[0] + + if selection_only: + return selected + else: + return selected, aggregated_pval, pvals + + +def _stat_coefficient_diff(X, X_tilde, y, estimator, preconfigure_estimator=None): # Compute statistic base on a cross validation # original implementation: # https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/stats_glmnet_cv.R + n_samples, n_features = X.shape X_ko = np.column_stack([X, X_tilde]) if preconfigure_estimator is not None: preconfigure_estimator(estimator, X, X_tilde, y) @@ -176,12 +338,87 @@ def model_x_knockoff( elif hasattr(estimator, 'best_estimator_') and hasattr(estimator.best_estimator_, 'coef_'): coef = np.ravel(estimator.best_estimator_.coef_) # for CV object test_score = np.abs(coef[:n_features]) - np.abs(coef[n_features:]) + return test_score + + +def _knockoff_threshold(test_score, fdr=0.1, offset=1): + """ + Calculate the knockoff threshold based on the procedure stated in the + article. 
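The procedure referred to here is the knockoff+ threshold of :cite:`barber2015controlling`; writing W_j for the test statistics and q for the target FDR level, it is

    T = \min \left\{ t > 0 \;:\; \frac{\mathrm{offset} + \#\{j : W_j \le -t\}}{\max\left(\#\{j : W_j \ge t\},\, 1\right)} \le q \right\}

The loop below scans the sorted non-zero |W_j| as candidate values of t and stops at the first one satisfying this inequality; if none does, the threshold is +inf and no variable is selected.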
- # run the knockoff filter - threshold = coef_diff_threshold(test_score, fdr=fdr, offset=offset) - selected = np.where(test_score >= threshold)[0] + original code: + https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/knockoff_filter.R - if verbose: - return selected, threshold, test_score, X_tilde, estimator - else: - return selected + Parameters + ---------- + test_score : 1D ndarray, shape (n_features, ) + vector of test statistic + + fdr : float, optional + desired controlled FDR(false discovery rate) level + + offset : int, 0 or 1, optional + offset equals 1 is the knockoff+ procedure + + Returns + ------- + threshold : float or np.inf + threshold level + """ + if offset not in (0, 1): + raise ValueError("'offset' must be either 0 or 1") + + threshold_mesh = np.sort(np.abs(test_score[test_score != 0])) + np.concatenate([[0], threshold_mesh, [np.inf]]) # if there is no solution, the threshold is inf + # find the right value of t for getting a good fdr + threshold = 0. + for threshold in threshold_mesh: + false_pos = np.sum(test_score <= -threshold) + selected = np.sum(test_score >= threshold) + if (offset + false_pos) / np.maximum(selected, 1) <= fdr: + break + return threshold + +def _empirical_knockoff_pval(test_score, offset=1): + """ + This function implements the computation of the empirical p-values + from knockoff test + """ + pvals = [] + n_features = test_score.size + + if offset not in (0, 1): + raise ValueError("'offset' must be either 0 or 1") + + test_score_inv = -test_score + for i in range(n_features): + if test_score[i] <= 0: + pvals.append(1) + else: + pvals.append( + (offset + np.sum(test_score_inv >= test_score[i])) / n_features + ) + + return np.array(pvals) + + +def _empirical_konckoff_eval(test_score, fdr=0.1, offset=1): + """ + This function implements the computation of the empirical e-values + from knockoff test + """ + evals = [] + n_features = test_score.size + + if offset not in (0, 1): + raise ValueError("'offset' must be either 0 or 1") + + ko_thr = _knockoff_threshold(test_score, fdr=fdr, offset=offset) + + for i in range(n_features): + if test_score[i] < ko_thr: + evals.append(0) + else: + evals.append(n_features / (offset + np.sum(test_score <= -ko_thr))) + + return np.array(evals) \ No newline at end of file diff --git a/src/hidimstat/stat_tools.py b/src/hidimstat/stat_tools.py index 51c1a6bf..0ced13e9 100644 --- a/src/hidimstat/stat_tools.py +++ b/src/hidimstat/stat_tools.py @@ -390,42 +390,3 @@ def two_sided_pval_from_pval(pval, one_minus_pval=None, distrib="norm"): ) return two_sided_pval, two_sided_pval_corr - - -def coef_diff_threshold(test_score, fdr=0.1, offset=1): - """ - Calculate the knockoff threshold based on the procedure stated in the - article. - - original code: - https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/knockoff_filter.R - - Parameters - ---------- - test_score : 1D ndarray, shape (n_features, ) - vector of test statistic - - fdr : float, optional - desired controlled FDR(false discovery rate) level - - offset : int, 0 or 1, optional - offset equals 1 is the knockoff+ procedure - - Returns - ------- - threshold : float or np.inf - threshold level - """ - if offset not in (0, 1): - raise ValueError("'offset' must be either 0 or 1") - - threshold_mesh = np.sort(np.abs(test_score[test_score != 0])) - np.concatenate([[0], threshold_mesh, [np.inf]]) # if there is no solution, the threshold is inf - # find the right value of t for getting a good fdr - threshold = 0. 
- for threshold in threshold_mesh: - false_pos = np.sum(test_score <= -threshold) - selected = np.sum(test_score >= threshold) - if (offset + false_pos) / np.maximum(selected, 1) <= fdr: - break - return threshold From a116b533901b3820dec9553d356df5818764f11a Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Jan 2025 17:11:19 +0100 Subject: [PATCH 12/48] Add function for repeat the gaussian knockoff --- src/hidimstat/gaussian_knockoff.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/hidimstat/gaussian_knockoff.py b/src/hidimstat/gaussian_knockoff.py index 5129a4c2..5f759245 100644 --- a/src/hidimstat/gaussian_knockoff.py +++ b/src/hidimstat/gaussian_knockoff.py @@ -1,9 +1,8 @@ import warnings - import numpy as np -def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14): +def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14, repeat=False): """ Generate second-order knockoff variables using equi-correlated method. Reference: Candes et al. (2016), Barber et al. (2015) @@ -53,9 +52,24 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14): ) # Equation 1.4 in Barber & Candes (2015) - X_tilde = Mu_tilde + np.dot(U_tilde, np.linalg.cholesky(sigma_tilde)) + sigma_tilde_decompose = np.linalg.cholesky(sigma_tilde) + X_tilde = Mu_tilde + np.dot(U_tilde, sigma_tilde_decompose) + + if not repeat: + return X_tilde + else: + return X_tilde, (Mu_tilde, sigma_tilde_decompose) + + +def repeat_gaussian_knockoff_generation(Mu_tilde, sigma_tilde_decompose, seed): + n_samples, n_features = Mu_tilde.shape/2 + # create a uniform noise for all the data + rng = np.random.RandomState(seed) + U_tilde = rng.randn(n_samples, n_features) + X_tilde = Mu_tilde + np.dot(U_tilde, sigma_tilde_decompose) return X_tilde + def _s_equi(sigma, tol=1e-14): From 16996758af95fd08e14225d4740603dd8f5c9f71 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Jan 2025 17:17:02 +0100 Subject: [PATCH 13/48] Put all the test for knockoff in one file --- test/{test_knockoff_aggregation.py => test_knockoff.py} | 0 test/test_stat_coef_diff.py | 1 - 2 files changed, 1 deletion(-) rename test/{test_knockoff_aggregation.py => test_knockoff.py} (100%) delete mode 100644 test/test_stat_coef_diff.py diff --git a/test/test_knockoff_aggregation.py b/test/test_knockoff.py similarity index 100% rename from test/test_knockoff_aggregation.py rename to test/test_knockoff.py diff --git a/test/test_stat_coef_diff.py b/test/test_stat_coef_diff.py deleted file mode 100644 index ad7310ae..00000000 --- a/test/test_stat_coef_diff.py +++ /dev/null @@ -1 +0,0 @@ -# To be done From db6098f660d01f6f926156675011f48ae48f7205 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Jan 2025 17:51:11 +0100 Subject: [PATCH 14/48] Include the new function in the init --- src/hidimstat/__init__.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/hidimstat/__init__.py b/src/hidimstat/__init__.py index cdc21880..e1ca8c6c 100644 --- a/src/hidimstat/__init__.py +++ b/src/hidimstat/__init__.py @@ -3,8 +3,7 @@ from .desparsified_lasso import desparsified_group_lasso, desparsified_lasso from .Dnn_learner_single import DnnLearnerSingle from .ensemble_clustered_inference import ensemble_clustered_inference -from .knockoff_aggregation import knockoff_aggregation -from .knockoffs import model_x_knockoff +from .knockoffs import model_x_knockoff, model_x_knockoff_aggregation, model_x_knockoff_filter, model_x_knockoff_pvalue, 
model_x_knockoff_bootstrap_quantile, model_x_knockoff_bootstrap_e_value from .multi_sample_split import aggregate_quantiles from .noise_std import group_reid, reid from .permutation_test import permutation_test_cv @@ -31,8 +30,12 @@ "ensemble_clustered_inference", "group_reid", "hd_inference", - "knockoff_aggregation", "model_x_knockoff", + "model_x_knockoff_aggregation", + "model_x_knockoff_filter", + "model_x_knockoff_pvalue", + "model_x_knockoff_bootstrap_quantile", + "model_x_knockoff_bootstrap_e_value", "multivariate_1D_simulation", "permutation_test_cv", "reid", From e38f376a7a1155a50525b872cad31a9c0c3949ef Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Jan 2025 17:51:46 +0100 Subject: [PATCH 15/48] Fix bug --- src/hidimstat/gaussian_knockoff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/gaussian_knockoff.py b/src/hidimstat/gaussian_knockoff.py index 5f759245..c861481e 100644 --- a/src/hidimstat/gaussian_knockoff.py +++ b/src/hidimstat/gaussian_knockoff.py @@ -62,7 +62,7 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14, repeat=Fals def repeat_gaussian_knockoff_generation(Mu_tilde, sigma_tilde_decompose, seed): - n_samples, n_features = Mu_tilde.shape/2 + n_samples, n_features = Mu_tilde.shape # create a uniform noise for all the data rng = np.random.RandomState(seed) U_tilde = rng.randn(n_samples, n_features) From 886148075cde372cc7f50fca74425fcf199ec4b0 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Jan 2025 17:52:32 +0100 Subject: [PATCH 16/48] Fix bugs --- src/hidimstat/knockoffs.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 333c5d27..9f04960b 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -165,7 +165,8 @@ def model_x_knockoff( return test_score -def knockoff_aggregation( + +def model_x_knockoff_aggregation( X, y, estimator=LassoCV(n_jobs=None, verbose=0,max_iter=1000, @@ -179,7 +180,7 @@ def knockoff_aggregation( n_jobs=1, random_state=None, ): - assert n_bootstraps <= 1, "the number of bootstraps should at least higher than 1" + assert n_bootstraps > 1, "the number of bootstraps should at least higher than 1" # unnecessary to have n_jobs > number of bootstraps n_jobs = min(n_bootstraps, n_jobs) parallel = Parallel(n_jobs, verbose=joblib_verbose) @@ -213,7 +214,7 @@ def knockoff_aggregation( X_tildes.insert(0, X_tilde) test_scores = parallel( - delayed(_stat_coefficient_diff)(X, X_tilde[i], y, estimator, preconfigure_estimator) + delayed(_stat_coefficient_diff)(X, X_tildes[i], y, estimator, preconfigure_estimator) for i in range(n_bootstraps) ) @@ -274,7 +275,7 @@ def model_x_knockoff_pvalue(test_score, fdr=0.1, fdr_control="bhq", offset=1, se """ This function implements the computation of the empirical p-values """ - pvals = _empirical_pval(test_score, offset) + pvals = _empirical_knockoff_pval(test_score, offset) threshold = fdr_threshold(pvals, fdr=fdr, method=fdr_control) selected = np.where(pvals <= threshold)[0] @@ -284,10 +285,10 @@ def model_x_knockoff_pvalue(test_score, fdr=0.1, fdr_control="bhq", offset=1, se return selected, pvals -def model_x_knockoff_bootstrap_e_value(test_score, fdr=0.1, offset=1, selection_only=True): - n_bootstraps = len(test_score) +def model_x_knockoff_bootstrap_e_value(test_scores, fdr=0.1, offset=1, selection_only=True): + n_bootstraps = len(test_scores) evals = np.array( - [_empirical_eval(test_score[i], fdr / 2, offset) for 
i in range(n_bootstraps)] + [_empirical_knockoff_eval(test_scores[i], fdr / 2, offset) for i in range(n_bootstraps)] ) aggregated_eval = np.mean(evals, axis=0) @@ -300,10 +301,10 @@ def model_x_knockoff_bootstrap_e_value(test_score, fdr=0.1, offset=1, selection_ return selected, aggregated_eval, evals -def model_x_knockoff_bootstrap_quantile(test_score, fdr=0.1, fdr_control="bhq", reshaping_function=None, adaptive_aggregation=False, gamma=0.5, gamma_min=0.05, offset=1, selection_only=True): - n_bootstraps = len(test_score) +def model_x_knockoff_bootstrap_quantile(test_scores, fdr=0.1, fdr_control="bhq", reshaping_function=None, adaptive_aggregation=False, gamma=0.5, gamma_min=0.05, offset=1, selection_only=True): + n_bootstraps = len(test_scores) pvals = np.array( - [_empirical_pval(test_score[i], offset) for i in range(n_bootstraps)] + [_empirical_knockoff_pval(test_scores[i], offset) for i in range(n_bootstraps)] ) aggregated_pval = quantile_aggregation( @@ -402,7 +403,7 @@ def _empirical_knockoff_pval(test_score, offset=1): return np.array(pvals) -def _empirical_konckoff_eval(test_score, fdr=0.1, offset=1): +def _empirical_knockoff_eval(test_score, fdr=0.1, offset=1): """ This function implements the computation of the empirical e-values from knockoff test From a0d53b02b1c992e62ce90418b5ba1faa378a061b Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Jan 2025 17:53:01 +0100 Subject: [PATCH 17/48] Fix test for new signature of the function --- test/test_knockoff.py | 97 ++++++++------------- test/test_stat_tools.py | 188 ---------------------------------------- 2 files changed, 34 insertions(+), 251 deletions(-) delete mode 100644 test/test_stat_tools.py diff --git a/test/test_knockoff.py b/test/test_knockoff.py index a94e334a..67747e01 100644 --- a/test/test_knockoff.py +++ b/test/test_knockoff.py @@ -1,4 +1,6 @@ -from hidimstat import knockoff_aggregation, model_x_knockoff +from hidimstat.knockoffs import model_x_knockoff_aggregation, model_x_knockoff,\ + model_x_knockoff_filter, model_x_knockoff_pvalue, model_x_knockoff_bootstrap_e_value,\ + model_x_knockoff_bootstrap_quantile from hidimstat.gaussian_knockoff import gaussian_knockoff_generation from hidimstat.data_simulation import simu_data from hidimstat.utils import cal_fdp_power @@ -6,24 +8,25 @@ import pytest from sklearn.covariance import LedoitWolf, GraphicalLassoCV -n = 500 -p = 100 -snr = 5 -n_bootstraps = 25 -fdr = 0.5 -X, y, _, non_zero_index = simu_data(n, p, snr=snr, seed=0) + def test_knockoff_aggregation(): - selected_verbose, aggregated_pval, pvals = knockoff_aggregation( - X, y, fdr=fdr, n_bootstraps=n_bootstraps, verbose=True, random_state=0 + n = 500 + p = 100 + snr = 5 + n_bootstraps = 25 + fdr = 0.5 + X, y, _, non_zero_index = simu_data(n, p, snr=snr, seed=0) + + test_scores = model_x_knockoff_aggregation( + X, y, n_bootstraps=n_bootstraps, random_state=0 ) - + selected_verbose, aggregated_pval, pvals = model_x_knockoff_bootstrap_quantile(test_scores, fdr=fdr, selection_only=False) + fdp_verbose, power_verbose = cal_fdp_power(selected_verbose, non_zero_index) - selected_no_verbose = knockoff_aggregation( - X, y, fdr=fdr, n_bootstraps=n_bootstraps, verbose=False, random_state=None - ) + selected_no_verbose = model_x_knockoff_bootstrap_quantile(test_scores, fdr=fdr, selection_only=True) fdp_no_verbose, power_no_verbose = cal_fdp_power( selected_no_verbose, non_zero_index @@ -34,52 +37,21 @@ def test_knockoff_aggregation(): assert power_verbose > 0.1 assert fdp_no_verbose < 0.5 assert power_no_verbose > 0.1 + 
np.testing.assert_array_equal(selected_no_verbose, selected_verbose) # Single AKO (or vanilla KO) (verbose vs no verbose) - selected_no_verbose = knockoff_aggregation( - X, y, fdr=fdr, verbose=False, n_bootstraps=1, random_state=5 - ) - - selected_verbose, pvals = knockoff_aggregation( - X, y, fdr=fdr, verbose=True, n_bootstraps=1, random_state=5 + test_bootstrap = model_x_knockoff_aggregation( + X, y, n_bootstraps=2, random_state=5 ) + test_no_bootstrap = model_x_knockoff(X, y, seed=np.random.RandomState(5).randint(1, np.iinfo(np.int32).max, 2)[0]) - selected_ko = model_x_knockoff(X, y, fdr=fdr, seed=5) - - np.testing.assert_array_equal(selected_no_verbose, selected_ko) - - fdp_no_verbose, power_no_verbose = cal_fdp_power( - selected_no_verbose, non_zero_index - ) - fdp_verbose, power_verbose = cal_fdp_power(selected_verbose, non_zero_index) - - assert fdp_verbose < 0.5 - assert power_verbose > 0.1 + np.testing.assert_array_equal(test_bootstrap[0], test_no_bootstrap) # Using e-values aggregation (verbose vs no verbose) - selected_verbose, aggregated_pval, pvals = knockoff_aggregation( - X, - y, - fdr=fdr, - n_bootstraps=n_bootstraps, - method="e-values", - verbose=True, - random_state=0, - ) - + selected_verbose, aggregated_eval, evals = model_x_knockoff_bootstrap_e_value(test_scores, fdr=fdr, selection_only=False) fdp_verbose, power_verbose = cal_fdp_power(selected_verbose, non_zero_index) - - selected_no_verbose = knockoff_aggregation( - X, - y, - fdr=fdr, - n_bootstraps=n_bootstraps, - method="e-values", - verbose=False, - random_state=None, - ) - + selected_no_verbose = model_x_knockoff_bootstrap_e_value(test_scores, fdr=fdr, selection_only=True) fdp_no_verbose, power_no_verbose = cal_fdp_power( selected_no_verbose, non_zero_index ) @@ -92,7 +64,7 @@ def test_knockoff_aggregation(): # Checking wrong type for random_state with pytest.raises(Exception): - _ = knockoff_aggregation( + _ = model_x_knockoff_aggregation( X, y, random_state="test", @@ -100,30 +72,26 @@ def test_knockoff_aggregation(): # Checking value for offset not belonging to (0,1) with pytest.raises(Exception): - _ = knockoff_aggregation( - X, - y, + _ = model_x_knockoff_bootstrap_quantile( + test_scores, offset=2, - method="quantile", - random_state="test", ) with pytest.raises(Exception): - _ = knockoff_aggregation( - X, - y, + _ = model_x_knockoff_bootstrap_e_value( + test_scores, offset=2, - method="e-values", - random_state="test", ) def test_model_x_knockoff(): + seed = 42 fdr = 0.2 n = 300 p = 300 X, y, _, non_zero = simu_data(n, p, seed=seed) - ko_result = model_x_knockoff(X, y, fdr=fdr, seed=seed + 1) + test_score = model_x_knockoff(X, y, seed=seed + 1) + ko_result = model_x_knockoff_filter(test_score, fdr=fdr,) fdp, power = cal_fdp_power(ko_result, non_zero) assert fdp <= 0.2 @@ -131,6 +99,9 @@ def test_model_x_knockoff(): def test_estimate_distribution(): + """ + TODO replace the estimate distribution by calling knockoff function with them + """ SEED = 42 fdr = 0.1 n = 100 diff --git a/test/test_stat_tools.py b/test/test_stat_tools.py deleted file mode 100644 index 5e5f8fa0..00000000 --- a/test/test_stat_tools.py +++ /dev/null @@ -1,188 +0,0 @@ -""" -Test the stat module -""" - -import numpy as np -from numpy.testing import assert_almost_equal, assert_equal - -from hidimstat.stat_tools import ( - _replace_infinity, - pval_corr_from_pval, - pval_from_cb, - pval_from_scale, - pval_from_two_sided_pval_and_sign, - two_sided_pval_from_cb, - two_sided_pval_from_pval, - two_sided_pval_from_zscore, - zscore_from_cb, - 
zscore_from_pval, -) - - -def test__replace_infinity(): - - x = np.asarray([10, np.inf, -np.inf]) - - # replace inf by the largest absolute value times two - x_clean = _replace_infinity(x) - expected = np.asarray([10, 20, -20]) - assert_equal(x_clean, expected) - - # replace inf by 40 - x_clean = _replace_infinity(x, replace_val=40) - expected = np.asarray([10, 40, -40]) - assert_equal(x_clean, expected) - - # replace inf by the largest absolute value plus one - x_clean = _replace_infinity(x, method="plus-one") - expected = np.asarray([10, 11, -11]) - assert_equal(x_clean, expected) - - -def test_pval_corr_from_pval(): - - pval = np.asarray([1.0, 0.025, 0.5]) - - # Correction for multiple testing: 3 features tested simultaneously - pval_corr = pval_corr_from_pval(pval) - expected = np.asarray([1.0, 0.075, 0.5]) - assert_almost_equal(pval_corr, expected, decimal=10) - - one_minus_pval = np.asarray([0.0, 0.975, 0.5]) - - # Correction for multiple testing: 3 features tested simultaneously - one_minus_pval_corr = pval_corr_from_pval(one_minus_pval) - expected = np.asarray([0.0, 0.925, 0.5]) - assert_almost_equal(one_minus_pval_corr, expected, decimal=10) - - -def test_pval_from_scale(): - - beta = np.asarray([-1.5, 1, 0]) - scale = np.asarray([0.25, 0.5, 0.5]) - - # Computing p-value and one minus the p-value. - pval, pval_corr, one_minus_pval, one_minus_pval_corr = pval_from_scale(beta, scale) - expected = np.asarray( - [[1.0, 0.022, 0.5], [1.0, 0.068, 0.5], [0.0, 0.978, 0.5], [0.0, 0.932, 0.5]] - ) - - assert_almost_equal(pval, expected[0], decimal=2) - assert_almost_equal(pval_corr, expected[1], decimal=2) - assert_almost_equal(one_minus_pval, expected[2], decimal=2) - assert_almost_equal(one_minus_pval_corr, expected[3], decimal=2) - - -def test_zscore_from_cb(): - - cb_min = np.asarray([-2, 0, -1]) - cb_max = np.asarray([-1, 2, 1]) - - # Computing z-scores from 95% confidence-intervals assuming Gaussianity - zscore = zscore_from_cb(cb_min, cb_max) - expected = np.asarray([-5.87, 1.96, 0]) - - assert_almost_equal(zscore, expected, decimal=2) - - -def test_pval_from_cb(): - - cb_min = np.asarray([-2, 0, -1]) - cb_max = np.asarray([-1, 2, 1]) - - # Computing p-value and one minus the p-value. 
- pval, pval_corr, one_minus_pval, one_minus_pval_corr = pval_from_cb(cb_min, cb_max) - expected = np.asarray( - [[1.0, 0.025, 0.5], [1.0, 0.075, 0.5], [0.0, 0.975, 0.5], [0.0, 0.925, 0.5]] - ) - - assert_almost_equal(pval, expected[0], decimal=2) - assert_almost_equal(pval_corr, expected[1], decimal=2) - assert_almost_equal(one_minus_pval, expected[2], decimal=2) - assert_almost_equal(one_minus_pval_corr, expected[3], decimal=2) - - -def test_two_sided_pval_from_zscore(): - - zscore = np.asarray([-5.87, 1.96, 0]) - - # Computing two-sided pval from z-scores assuming Gaussianity - two_sided_pval, two_sided_pval_corr = two_sided_pval_from_zscore(zscore) - expected = np.asarray([[0.0, 0.05, 1.0], [0.0, 0.15, 1.0]]) - - assert_almost_equal(two_sided_pval, expected[0], decimal=2) - assert_almost_equal(two_sided_pval_corr, expected[1], decimal=2) - - -def test_two_sided_pval_from_cb(): - - cb_min = np.asarray([-2, 0, -1]) - cb_max = np.asarray([-1, 2, 1]) - - # Computing two-sided pval from 95% confidence bounds assuming Gaussianity - two_sided_pval, two_sided_pval_corr = two_sided_pval_from_cb(cb_min, cb_max) - expected = np.asarray([[0.0, 0.05, 1.0], [0.0, 0.15, 1.0]]) - - assert_almost_equal(two_sided_pval, expected[0], decimal=2) - assert_almost_equal(two_sided_pval_corr, expected[1], decimal=2) - - -def test_zscore_from_pval(): - - pval = np.asarray([1.0, 0.025, 0.5, 0.975]) - - # Computing z-scores from p-value - zscore = zscore_from_pval(pval) - expected = _replace_infinity( - np.asarray([-np.inf, 1.96, 0, -1.96]), replace_val=40, method="plus-one" - ) - - assert_almost_equal(zscore, expected, decimal=2) - - pval = np.asarray([1.0, 0.025, 0.5, 0.975]) - one_minus_pval = np.asarray([0.0, 0.975, 0.5, 0.025]) - - # Computing z-scores from p-value and one minus the p-value - zscore = zscore_from_pval(pval, one_minus_pval) - expected = _replace_infinity( - np.asarray([-np.inf, 1.96, 0, -1.96]), replace_val=40, method="plus-one" - ) - - assert_almost_equal(zscore, expected, decimal=2) - - -def test_pval_from_two_sided_pval_and_sign(): - - two_sided_pval = np.asarray([0.025, 0.05, 0.5]) - parameter_sign = np.asarray([-1.0, 1.0, -1.0]) - - # One-sided p-values from two-sided p-value and sign. - pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( - pval_from_two_sided_pval_and_sign(two_sided_pval, parameter_sign) - ) - expected = np.asarray( - [ - [0.9875, 0.025, 0.75], - [0.9625, 0.075, 0.5], - [0.0125, 0.975, 0.25], - [0.0375, 0.925, 0.5], - ] - ) - - assert_equal(pval, expected[0]) - assert_almost_equal(pval_corr, expected[1]) - assert_equal(one_minus_pval, expected[2]) - assert_almost_equal(one_minus_pval_corr, expected[3]) - - -def test_two_sided_pval_from_pval(): - - pval = np.asarray([1.0, 0.025, 0.5]) - one_minus_pval = np.asarray([0.0, 0.975, 0.5]) - - # Two-sided p-value from one-side p-values. 
- two_sided_pval, two_sided_pval_corr = two_sided_pval_from_pval(pval, one_minus_pval) - expected = np.asarray([[0.0, 0.05, 1.0], [0.0, 0.15, 1.0]]) - - assert_almost_equal(two_sided_pval, expected[0], decimal=2) - assert_almost_equal(two_sided_pval_corr, expected[1], decimal=2) From 4bb9eb46a7a3205623204842540ed2b238f60c6a Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Jan 2025 18:26:09 +0100 Subject: [PATCH 18/48] Improve the docstring --- doc_conf/references.bib | 49 ++++++-------- src/hidimstat/gaussian_knockoff.py | 103 +++++++++++++++++++++-------- 2 files changed, 94 insertions(+), 58 deletions(-) diff --git a/doc_conf/references.bib b/doc_conf/references.bib index 8fa09830..779af7f6 100644 --- a/doc_conf/references.bib +++ b/doc_conf/references.bib @@ -77,22 +77,6 @@ @article{Ren_2023 eprint = {https://academic.oup.com/jrsssb/article-pdf/86/1/122/56629998/qkad085.pdf}, } -@article{Candes_2018, - author = {Candès, Emmanuel and Fan, Yingying and Janson, Lucas and Lv, Jinchi}, - title = "{Panning for Gold: ‘Model-X’ Knockoffs for High Dimensional Controlled Variable Selection}", - journal = {Journal of the Royal Statistical Society Series B: Statistical Methodology}, - volume = {80}, - number = {3}, - pages = {551-577}, - year = {2018}, - month = {01}, - abstract = "{ Many contemporary large-scale applications involve building interpretable models linking a large set of potential covariates to a response in a non-linear fashion, such as when the response is binary. Although this modelling problem has been extensively studied, it remains unclear how to control the fraction of false discoveries effectively even in high dimensional logistic regression, not to mention general high dimensional non-linear models. To address such a practical problem, we propose a new framework of ‘model-X’ knockoffs, which reads from a different perspective the knockoff procedure that was originally designed for controlling the false discovery rate in linear models. Whereas the knockoffs procedure is constrained to homoscedastic linear models with n⩾p, the key innovation here is that model-X knockoffs provide valid inference from finite samples in settings in which the conditional distribution of the response is arbitrary and completely unknown. Furthermore, this holds no matter the number of covariates. Correct inference in such a broad setting is achieved by constructing knockoff variables probabilistically instead of geometrically. To do this, our approach requires that the covariates are random (independent and identically distributed rows) with a distribution that is known, although we provide preliminary experimental evidence that our procedure is robust to unknown or estimated distributions. To our knowledge, no other procedure solves the controlled variable selection problem in such generality but, in the restricted settings where competitors exist, we demonstrate the superior power of knockoffs through simulations. 
Finally, we apply our procedure to data from a case–control study of Crohn's disease in the UK, making twice as many discoveries as the original analysis of the same data.}", - issn = {1369-7412}, - doi = {10.1111/rssb.12265}, - url = {https://doi.org/10.1111/rssb.12265}, - eprint = {https://academic.oup.com/jrsssb/article-pdf/80/3/551/49274696/jrsssb\_80\_3\_551.pdf}, -} - @article{breimanRandomForests2001, title = {Random {{Forests}}}, author = {Breiman, Leo}, @@ -148,22 +132,27 @@ @article{miPermutationbasedIdentificationImportant2021 keywords = {Cancer,Data mining,Machine learning,Statistical methods}, } -@article{candesPanningGoldModelX2017, - title = {Panning for {{Gold}}: {{Model-X Knockoffs}} for {{High-dimensional Controlled Variable Selection}}}, - shorttitle = {Panning for {{Gold}}}, - author = {Candes, Emmanuel and Fan, Yingying and Janson, Lucas and Lv, Jinchi}, - year = {2017}, - month = dec, - journal = {arXiv:1610.02351 [math, stat]}, - eprint = {1610.02351}, - primaryclass = {math, stat}, - urldate = {2022-01-12}, - abstract = {Many contemporary large-scale applications involve building interpretable models linking a large set of potential covariates to a response in a nonlinear fashion, such as when the response is binary. Although this modeling problem has been extensively studied, it remains unclear how to effectively control the fraction of false discoveries even in high-dimensional logistic regression, not to mention general high-dimensional nonlinear models. To address such a practical problem, we propose a new framework of \$model\$-\$X\$ knockoffs, which reads from a different perspective the knockoff procedure (Barber and Cand{\textbackslash}`es, 2015) originally designed for controlling the false discovery rate in linear models. Whereas the knockoffs procedure is constrained to homoscedastic linear models with \$n{\textbackslash}ge p\$, the key innovation here is that model-X knockoffs provide valid inference from finite samples in settings in which the conditional distribution of the response is arbitrary and completely unknown. Furthermore, this holds no matter the number of covariates. Correct inference in such a broad setting is achieved by constructing knockoff variables probabilistically instead of geometrically. To do this, our approach requires the covariates be random (independent and identically distributed rows) with a distribution that is known, although we provide preliminary experimental evidence that our procedure is robust to unknown/estimated distributions. To our knowledge, no other procedure solves the \$controlled\$ variable selection problem in such generality, but in the restricted settings where competitors exist, we demonstrate the superior power of knockoffs through simulations. Finally, we apply our procedure to data from a case-control study of Crohn's disease in the United Kingdom, making twice as many discoveries as the original analysis of the same data.}, - archiveprefix = {arxiv}, - keywords = {Mathematics - Statistics Theory,Statistics - Applications,Statistics - Methodology}, - file = {/home/ahmad/Zotero/storage/YZ23F3Q5/Candes et al. 
- 2017 - Panning for Gold Model-X Knockoffs for High-dimen.pdf;/home/ahmad/Zotero/storage/ZSN64F6N/1610.html} +@article{candes2018panning, + title={Panning for gold:‘model-X’knockoffs for high dimensional controlled variable selection}, + author={Candes, Emmanuel and Fan, Yingying and Janson, Lucas and Lv, Jinchi}, + journal={Journal of the Royal Statistical Society Series B: Statistical Methodology}, + volume={80}, + number={3}, + pages={551--577}, + year={2018}, + publisher={Oxford University Press} + } + +@article{barber2015controlling, + title={Controlling the false discovery rate via knockoffs}, + author={Barber, Rina Foygel and Cand{\`e}s, Emmanuel J}, + journal={The Annals of statistics}, + pages={2055--2085}, + year={2015}, + publisher={JSTOR} } + @article{liuFastPowerfulConditional2021, title = {Fast and {{Powerful Conditional Randomization Testing}} via {{Distillation}}}, author = {Liu, Molei and Katsevich, Eugene and Janson, Lucas and Ramdas, Aaditya}, diff --git a/src/hidimstat/gaussian_knockoff.py b/src/hidimstat/gaussian_knockoff.py index c861481e..1ad3a436 100644 --- a/src/hidimstat/gaussian_knockoff.py +++ b/src/hidimstat/gaussian_knockoff.py @@ -4,30 +4,43 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14, repeat=False): """ - Generate second-order knockoff variables using equi-correlated method. - Reference: Candes et al. (2016), Barber et al. (2015) - - original impelmentation: - https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/create_gaussian.R - + Generate second-order knockoff variables using the equi-correlated method. + + This function generates knockoff variables for a given design matrix X, + using the equi-correlated method described in :cite:`barber2015controlling` and + :cite:`candes2018panning`. The function takes as input the design matrix X, + the vector of empirical mean values mu, and the empirical covariance + matrix sigma. It returns the knockoff variables X_tilde. + Parameters ---------- X: 2D ndarray (n_samples, n_features) - original design matrix + The original design matrix. mu : 1D ndarray (n_features, ) - vector of empirical mean values - - method: str - method to generate gaussian knockoff + A vector of empirical mean values. sigma : 2D ndarray (n_samples, n_features) - empirical covariance matrix + The empirical covariance matrix. + + seed : int, optional + A random seed for generating the uniform noise used to create the knockoff variables. + + tol : float, optional + A tolerance value used for numerical stability in the calculation of the Cholesky decomposition. + + repeat : bool, optional + If True, the function returns the values used to generate the knockoff variables (Mu_tilde and sigma_tilde_decompose), + which can be used to generate additional knockoff variables without having to recompute them. Returns ------- X_tilde : 2D ndarray (n_samples, n_features) - knockoff design matrix + The knockoff variables. + + References + ---------- + .. footbibliography:: """ n_samples, n_features = X.shape @@ -43,7 +56,7 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14, repeat=Fals Mu_tilde = X - np.dot(X - mu, Sigma_inv_s) # To calculate the Cholesky decomposition later on sigma_tilde = 2 * Diag_s - Diag_s.dot(Sigma_inv_s.dot(Diag_s)) #TODO extra operation Sigma_inv_s.dot(Diag_s) ?????? 
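For reference, the second-order construction these lines implement (:cite:`candes2018panning`, with D = diag(s) obtained from _s_equi below) draws the knockoffs from the conditional Gaussian

    \mu_{ko} = X - (X - \mu)\,\Sigma^{-1} D, \qquad
    \Sigma_{ko} = 2D - D\,\Sigma^{-1} D, \qquad
    \tilde{X} = \mu_{ko} + U\,C,

where U is an (n_samples, n_features) matrix of standard normal noise and C is a Cholesky factor of \Sigma_{ko}. The D\,\Sigma^{-1} D form of the covariance term is the expression the TODO above can be checked against.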
- # test is the matrix is positive definite + # test is the matrix is positive definite while not np.all(np.linalg.eigvalsh(sigma_tilde) > tol): sigma_tilde += 1e-10 * np.eye(n_features) warnings.warn( @@ -62,7 +75,33 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14, repeat=Fals def repeat_gaussian_knockoff_generation(Mu_tilde, sigma_tilde_decompose, seed): + """ + Generate additional knockoff variables using pre-computed values. + + This function generates additional knockoff variables using pre-computed values returned by the + gaussian_knockoff_generation function with repeat=True. It takes as input Mu_tilde and + sigma_tilde_decompose, which were returned by gaussian_knockoff_generation, and a random seed. + It returns the new knockoff variables X_tilde. + + Parameters + ---------- + Mu_tilde : 2D ndarray (n_samples, n_features) + The matrix of means used to generate the knockoff variables, returned by gaussian_knockoff_generation. + + sigma_tilde_decompose : 2D ndarray (n_features, n_features) + The Cholesky decomposition of the covariance matrix used to generate the knockoff variables, + returned by gaussian_knockoff_generation. + + seed : int + A random seed for generating the uniform noise used to create the knockoff variables. + + Returns + ------- + X_tilde : 2D ndarray (n_samples, n_features) + The knockoff variables. + """ n_samples, n_features = Mu_tilde.shape + # create a uniform noise for all the data rng = np.random.RandomState(seed) U_tilde = rng.randn(n_samples, n_features) @@ -73,21 +112,31 @@ def repeat_gaussian_knockoff_generation(Mu_tilde, sigma_tilde_decompose, seed): def _s_equi(sigma, tol=1e-14): - """Estimate diagonal matrix of correlation between real and knockoff - variables using equi-correlated equation - - original implementation: - https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/solve_equi.R + """ + Estimate the diagonal matrix of correlation between real and knockoff variables using the equi-correlated equation. + + This function estimates the diagonal matrix of correlation between real and knockoff variables using the + equi-correlated equation described in :cite:`barber2015controlling` and + :cite:`candes2018panning`. It takes as input the empirical covariance matrix sigma and a tolerance value tol, + and returns a vector of diagonal values of the estimated matrix diag{s}. Parameters ---------- sigma : 2D ndarray (n_features, n_features) - empirical covariance matrix calculated from original design matrix + The empirical covariance matrix calculated from the original design matrix. + + tol : float, optional + A tolerance value used for numerical stability in the calculation of the eigenvalues of the correlation matrix. Returns ------- 1D ndarray (n_features, ) - vector of diagonal values of estimated matrix diag{s} + A vector of diagonal values of the estimated matrix diag{s}. + + Raises + ------ + Exception + If the covariance matrix is not positive-definite. 
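In equation form, with C the correlation matrix derived from sigma and \lambda_{\min}(C) its smallest eigenvalue, the equi-correlated choice made below is

    s_j = \min\left(2\,\lambda_{\min}(C),\, 1\right) \cdot \Sigma_{jj}, \qquad j = 1, \dots, p,

shrunk further by the loop at the end of the function until 2C - \mathrm{diag}(s) stays positive definite within the tolerance.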
""" n_features = sigma.shape[0] @@ -96,28 +145,26 @@ def _s_equi(sigma, tol=1e-14): features_std = np.sqrt(np.diag(sigma)) scale = np.outer(features_std, features_std) corr_matrix = sigma / scale - - + eig_value = np.linalg.eigvalsh(corr_matrix) lambda_min = np.min(eig_value[0]) # check if the matrix is positive-defined if lambda_min < 0: raise Exception('The covariance matrix is not positive-definite.') - - S = np.ones(n_features) * min(2 * lambda_min, 1) + S = np.ones(n_features) * min(2 * lambda_min, 1) + psd = np.all(np.linalg.eigvalsh(2 * corr_matrix - np.diag(S)) > tol) s_eps = 0 - while psd is False: if s_eps == 0: s_eps = np.eps # avoid zeros - else: + s_eps = np.eps s_eps *= 10 # if all eigval > 0 then the matrix is positive define psd = np.all(np.linalg.eigvalsh(2 * corr_matrix - np.diag(S * (1 - s_eps))) > tol) - S = S * (1 - s_eps) return S * np.diag(sigma) + From d21f46a8632f1265a3c0663e8a10ee731a21e2dc Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Jan 2025 18:40:21 +0100 Subject: [PATCH 19/48] Improve docstring knockoff --- src/hidimstat/knockoffs.py | 366 +++++++++++++++++++++++++++++-------- 1 file changed, 290 insertions(+), 76 deletions(-) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 9f04960b..b759c40b 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -27,33 +27,37 @@ def preconfigure_estimator_LaccosCV(estimator, X, X_tilde, y, n_lambdas=10): path is defined by a sequence of lambda values, which control the amount of shrinkage applied to the coefficient estimates. - Parameters: - estimator: sklearn.linear_model._base.LinearModel + Parameters + ---------- + estimator : sklearn.linear_model._base.LinearModel An instance of a linear model estimator from sklearn.linear_model. This estimator will be used to fit the data and compute the test statistics. In this case, it should be an instance of LassoCV. - X: 2D ndarray (n_samples, n_features) + X : 2D ndarray (n_samples, n_features) The original design matrix. - X_tilde: 2D ndarray (n_samples, n_features) + X_tilde : 2D ndarray (n_samples, n_features) The knockoff design matrix. - y: 1D ndarray (n_samples, ) + y : 1D ndarray (n_samples, ) The target vector. - n_lambdas: int, optional (default=10) - The number of lambda values to use to instansiate the cross validation. + n_lambdas : int, optional (default=10) + The number of lambda values to use to instantiate the cross-validation. - Returns: + Returns + ------- None The function modifies the `estimator` object in-place. - Raises: + Raises + ------ TypeError If the estimator is not an instance of LassoCV. - Note: + Notes + ----- This function is specifically designed for the Model-X knockoffs procedure, which combines original and knockoff variables in the design matrix. """ @@ -82,9 +86,10 @@ def model_x_knockoff( Model-X Knockoff This module implements the Model-X knockoff inference procedure, which is an approach - to control the False Discovery Rate (FDR) based on Candes et al. (2017). The original + to control the False Discovery Rate (FDR) based on :cite:`candes2018panning`. The original implementation can be found at https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/knockoff_filter.R + The noisy variable are generate with a second-order knockoff variables using equi-correlated method. Parameters ---------- @@ -93,8 +98,8 @@ def model_x_knockoff( y : 1D ndarray (n_samples, ) The target vector. 
- - estimator : sklearn estimator instance, optional + + estimator : sklearn estimator instance or a cross validation instance, optional The estimator used for fitting the data and computing the test statistics. This can be any estimator with a `fit` method that accepts a 2D array and a 1D array, and a `coef_` attribute that returns a 1D array of coefficients. @@ -121,27 +126,18 @@ def model_x_knockoff( attribute that returns a 2D array of the estimated covariance matrix. Examples include LedoitWolf and GraphicalLassoCV. - verbose : bool, optional (default=False) - Whether to return additional information along with the selected variables. - If True, the function will return a tuple containing the selected variables, - the threshold, the test scores, and the knockoff design matrix. If False, - the function will return only the selected variables. - seed : int or None, optional (default=None) The random seed used to generate the Gaussian knockoff variables. - Returns - ------- - selected : 1D array, int - A vector of indices of the selected variables. - - threshold : float - The knockoff threshold. + Returns + ------- test_score : 1D array, (n_features, ) A vector of test statistics. - X_tilde : 2D array, (n_samples, n_features) - The knockoff design matrix. + Notes + ----- + This function calculates the test statistics based on the difference in absolute + coefficient values between the original and knockoff variables. References ---------- @@ -180,6 +176,56 @@ def model_x_knockoff_aggregation( n_jobs=1, random_state=None, ): + """ + Model-X Knockoff Aggregation + + This function generates multiple sets of Gaussian knockoff variables and calculates + the test statistics for each set. It then aggregates the test statistics across + the sets to improve stability and power. + + Parameters + ---------- + X : 2D ndarray (n_samples, n_features) + The design matrix. + + y : 1D ndarray (n_samples, ) + The target vector. + + estimator : sklearn estimator instance or a cross validation instance, optional + The estimator used for fitting the data and computing the test statistics. + + preconfigure_estimator : function, optional + A function that configures the estimator for the Model-X knockoff procedure. + + centered : bool, optional (default=True) + Whether to standardize the data before performing the inference procedure. + + cov_estimator : sklearn covariance estimator instance, optional + The method used to estimate the empirical covariance matrix. + + joblib_verbose : int, optional (default=0) + The verbosity level of the joblib parallel processing. + + n_bootstraps : int, optional (default=25) + The number of bootstrap samples to generate for aggregation. + + n_jobs : int, optional (default=1) + The number of jobs to run in parallel. + + random_state : int or None, optional (default=None) + The random seed used to generate the Gaussian knockoff variables. + + Returns + ------- + test_scores : 2D ndarray (n_bootstraps, n_features) + A matrix of test statistics for each bootstrap sample. + + Notes + ----- + This function generates multiple sets of Gaussian knockoff variables and calculates + the test statistics for each set using the `_stat_coefficient_diff` function. It + then aggregates the test statistics across the sets to improve stability and power. 
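A minimal sketch of the intended bootstrap workflow, mirroring the updated tests later in this series (simu_data and all parameter values are illustrative):

    from hidimstat.data_simulation import simu_data
    from hidimstat.knockoffs import (
        model_x_knockoff_aggregation,
        model_x_knockoff_bootstrap_quantile,
        model_x_knockoff_bootstrap_e_value,
    )
    from hidimstat.utils import cal_fdp_power

    X, y, _, non_zero_index = simu_data(500, 100, snr=5, seed=0)
    test_scores = model_x_knockoff_aggregation(X, y, n_bootstraps=25, random_state=0)

    # aggregate across bootstraps via quantile-aggregated p-values or via e-values
    selected_quantile = model_x_knockoff_bootstrap_quantile(test_scores, fdr=0.5)
    selected_e_value = model_x_knockoff_bootstrap_e_value(test_scores, fdr=0.5)
    fdp, power = cal_fdp_power(selected_quantile, non_zero_index)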
+ """ assert n_bootstraps > 1, "the number of bootstraps should at least higher than 1" # unnecessary to have n_jobs > number of bootstraps n_jobs = min(n_bootstraps, n_jobs) @@ -240,7 +286,7 @@ def model_x_knockoff_filter(test_score, fdr=0.1, offset=1, selection_only=True): selection_only : bool, optional (default=True) Whether to return only the selected variables or additional information. If True, the function will return only the selected variables. If False, - the function will return the selected variables, the threshold, and the test scores. + the function will return the selected variables and the threshold. Returns ------- @@ -250,9 +296,6 @@ def model_x_knockoff_filter(test_score, fdr=0.1, offset=1, selection_only=True): threshold : float The knockoff threshold. - test_score : 1D array, (n_features, ) - A vector of test statistics. - Notes ----- This function calculates the knockoff threshold based on the test statistics and the @@ -274,6 +317,39 @@ def model_x_knockoff_filter(test_score, fdr=0.1, offset=1, selection_only=True): def model_x_knockoff_pvalue(test_score, fdr=0.1, fdr_control="bhq", offset=1, selection_only=True): """ This function implements the computation of the empirical p-values + + Parameters + ---------- + test_score : 1D array, (n_features, ) + A vector of test statistics. + + fdr : float, optional (default=0.1) + The desired controlled False Discovery Rate (FDR) level. + + fdr_control : str, optional (default="bhq") + The method used to control the False Discovery Rate. + + offset : int, 0 or 1, optional (default=1) + The offset to calculate the knockoff threshold. An offset of 1 is equivalent to + knockoff+. + + selection_only : bool, optional (default=True) + Whether to return only the selected variables or additional information. + If True, the function will return only the selected variables. If False, + the function will return the selected variables and the p-values. + + Returns + ------- + selected : 1D array, int + A vector of indices of the selected variables. + + pvals : 1D array, (n_features, ) + A vector of empirical p-values. + + Notes + ----- + This function calculates the empirical p-values based on the test statistics and the + desired FDR level. It then identifies the selected variables based on the p-values. """ pvals = _empirical_knockoff_pval(test_score, offset) threshold = fdr_threshold(pvals, fdr=fdr, method=fdr_control) @@ -286,49 +362,161 @@ def model_x_knockoff_pvalue(test_score, fdr=0.1, fdr_control="bhq", offset=1, se def model_x_knockoff_bootstrap_e_value(test_scores, fdr=0.1, offset=1, selection_only=True): - n_bootstraps = len(test_scores) - evals = np.array( - [_empirical_knockoff_eval(test_scores[i], fdr / 2, offset) for i in range(n_bootstraps)] - ) + """ + This function implements the computation of the empirical e-values + from knockoff test and aggregates them using the e-BH procedure. - aggregated_eval = np.mean(evals, axis=0) - threshold = fdr_threshold(aggregated_eval, fdr=fdr, method="ebh") - selected = np.where(aggregated_eval >= threshold)[0] + Parameters + ---------- + test_scores : 2D array, (n_bootstraps, n_features) + A matrix of test statistics for each bootstrap sample. - if selection_only: - return selected - else: - return selected, aggregated_eval, evals + fdr : float, optional (default=0.1) + The desired controlled False Discovery Rate (FDR) level. + + offset : int, 0 or 1, optional (default=1) + The offset to calculate the knockoff threshold. An offset of 1 is equivalent to + knockoff+. 
+ + selection_only : bool, optional (default=True) + Whether to return only the selected variables or additional information. + + Returns + ------- + selected : 1D array, int + A vector of indices of the selected variables. + + aggregated_eval : 1D array, (n_features, ) + A vector of aggregated empirical e-values. + + evals : 2D array, (n_bootstraps, n_features) + A matrix of empirical e-values for each bootstrap sample. + + Notes + ----- + This function calculates the empirical e-values based on the test statistics and the + desired FDR level. It then aggregates the e-values using the e-BH procedure and identifies + the selected variables based on the aggregated e-values. + """ + n_bootstraps = len(test_scores) + evals = np.array( + [_empirical_knockoff_eval(test_scores[i], fdr / 2, offset) for i in range(n_bootstraps)] + ) + + aggregated_eval = np.mean(evals, axis=0) + threshold = fdr_threshold(aggregated_eval, fdr=fdr, method="ebh") + selected = np.where(aggregated_eval >= threshold)[0] + + if selection_only: + return selected + else: + return selected, aggregated_eval, evals def model_x_knockoff_bootstrap_quantile(test_scores, fdr=0.1, fdr_control="bhq", reshaping_function=None, adaptive_aggregation=False, gamma=0.5, gamma_min=0.05, offset=1, selection_only=True): - n_bootstraps = len(test_scores) - pvals = np.array( - [_empirical_knockoff_pval(test_scores[i], offset) for i in range(n_bootstraps)] - ) + """ + This function implements the computation of the empirical p-values + from knockoff test and aggregates them using the quantile aggregation procedure. - aggregated_pval = quantile_aggregation( - pvals, gamma=gamma, gamma_min=gamma_min, adaptive=adaptive_aggregation - ) + Parameters + ---------- + test_scores : 2D array, (n_bootstraps, n_features) + A matrix of test statistics for each bootstrap sample. - threshold = fdr_threshold( - aggregated_pval, - fdr=fdr, - method=fdr_control, - reshaping_function=reshaping_function, - ) - selected = np.where(aggregated_pval <= threshold)[0] + fdr : float, optional (default=0.1) + The desired controlled False Discovery Rate (FDR) level. - if selection_only: - return selected - else: - return selected, aggregated_pval, pvals + fdr_control : str, optional (default="bhq") + The method used to control the False Discovery Rate. + + reshaping_function : function, optional (default=None) + A function used to reshape the aggregated p-values before controlling the FDR. + + adaptive_aggregation : bool, optional (default=False) + Whether to use adaptive quantile aggregation. + + gamma : float, optional (default=0.5) + The quantile level for aggregation. + + gamma_min : float, optional (default=0.05) + The minimum quantile level for adaptive aggregation. + + offset : int, 0 or 1, optional (default=1) + The offset to calculate the knockoff threshold. An offset of 1 is equivalent to + knockoff+. + + selection_only : bool, optional (default=True) + Whether to return only the selected variables or additional information. + + Returns + ------- + selected : 1D array, int + A vector of indices of the selected variables. + + aggregated_pval : 1D array, (n_features, ) + A vector of aggregated empirical p-values. + + pvals : 2D array, (n_bootstraps, n_features) + A matrix of empirical p-values for each bootstrap sample. + + Notes + ----- + This function calculates the empirical p-values based on the test statistics and the + desired FDR level. 
It then aggregates the p-values using the quantile aggregation + procedure and identifies the selected variables based on the aggregated p-values. + """ + n_bootstraps = len(test_scores) + pvals = np.array( + [_empirical_knockoff_pval(test_scores[i], offset) for i in range(n_bootstraps)] + ) + + aggregated_pval = quantile_aggregation( + pvals, gamma=gamma, gamma_min=gamma_min, adaptive=adaptive_aggregation + ) + + threshold = fdr_threshold( + aggregated_pval, + fdr=fdr, + method=fdr_control, + reshaping_function=reshaping_function, + ) + selected = np.where(aggregated_pval <= threshold)[0] + + if selection_only: + return selected + else: + return selected, aggregated_pval, pvals def _stat_coefficient_diff(X, X_tilde, y, estimator, preconfigure_estimator=None): - # Compute statistic base on a cross validation - # original implementation: - # https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/stats_glmnet_cv.R + """ + Compute statistic based on a cross-validation procedure. + + Original implementation: + https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/stats_glmnet_cv.R + + Parameters + ---------- + X : 2D ndarray (n_samples, n_features) + The original design matrix. + + X_tilde : 2D ndarray (n_samples, n_features) + The knockoff design matrix. + + y : 1D ndarray (n_samples, ) + The target vector. + + estimator : sklearn estimator instance or a cross-validation instance + The estimator used for fitting the data and computing the test statistics. + + preconfigure_estimator : function, optional + A function that configures the estimator for the Model-X knockoff procedure. + + Returns + ------- + test_score : 1D ndarray, shape (n_features, ) + Vector of test statistics. + """ n_samples, n_features = X.shape X_ko = np.column_stack([X, X_tilde]) if preconfigure_estimator is not None: @@ -344,27 +532,26 @@ def _stat_coefficient_diff(X, X_tilde, y, estimator, preconfigure_estimator=None def _knockoff_threshold(test_score, fdr=0.1, offset=1): """ - Calculate the knockoff threshold based on the procedure stated in the - article. - - original code: + Calculate the knockoff threshold based on the procedure stated in the article. + + Original code: https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/knockoff_filter.R Parameters ---------- test_score : 1D ndarray, shape (n_features, ) - vector of test statistic + Vector of test statistic. fdr : float, optional - desired controlled FDR(false discovery rate) level + Desired controlled FDR (false discovery rate) level. offset : int, 0 or 1, optional - offset equals 1 is the knockoff+ procedure + Offset equals 1 is the knockoff+ procedure. Returns ------- threshold : float or np.inf - threshold level + Threshold level. """ if offset not in (0, 1): raise ValueError("'offset' must be either 0 or 1") @@ -382,8 +569,20 @@ def _knockoff_threshold(test_score, fdr=0.1, offset=1): def _empirical_knockoff_pval(test_score, offset=1): """ - This function implements the computation of the empirical p-values - from knockoff test + Compute the empirical p-values from the knockoff test. + + Parameters + ---------- + test_score : 1D ndarray, shape (n_features, ) + Vector of test statistics. + + offset : int, 0 or 1, optional + Offset equals 1 is the knockoff+ procedure. + + Returns + ------- + pvals : 1D ndarray, shape (n_features, ) + Vector of empirical p-values. 
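Spelled out, the quantity computed below for each feature j (with p = n_features) is

    p_j =
    \begin{cases}
        1 & \text{if } W_j \le 0, \\
        \dfrac{\mathrm{offset} + \#\{k : W_k \le -W_j\}}{p} & \text{if } W_j > 0,
    \end{cases}

so sign flips of the knockoff statistics play the role of draws from the null distribution.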
""" pvals = [] n_features = test_score.size @@ -405,8 +604,23 @@ def _empirical_knockoff_pval(test_score, offset=1): def _empirical_knockoff_eval(test_score, fdr=0.1, offset=1): """ - This function implements the computation of the empirical e-values - from knockoff test + Compute the empirical e-values from the knockoff test. + + Parameters + ---------- + test_score : 1D ndarray, shape (n_features, ) + Vector of test statistics. + + fdr : float, optional + Desired controlled FDR (false discovery rate) level. + + offset : int, 0 or 1, optional + Offset equals 1 is the knockoff+ procedure. + + Returns + ------- + evals : 1D ndarray, shape (n_features, ) + Vector of empirical e-values. """ evals = [] n_features = test_score.size From f752f9e06c8ba2a1bd1078d4983a53a65a1500a8 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Jan 2025 18:48:31 +0100 Subject: [PATCH 20/48] /bin/bash: line 1: :wq: command not found --- doc_conf/api.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc_conf/api.rst b/doc_conf/api.rst index 885873f3..710ff857 100644 --- a/doc_conf/api.rst +++ b/doc_conf/api.rst @@ -23,8 +23,12 @@ Functions ensemble_clustered_inference group_reid hd_inference - knockoff_aggregation model_x_knockoff + model_x_knockoff_aggregation + model_x_knockoff_filter + model_x_knockoff_pvalue + model_x_knockoff_bootstrap_quantile + model_x_knockoff_bootstrap_e_value multivariate_1D_simulation permutation_test_cv reid From 071ea6d84bccea3eadcf33be1b8a993e83e4e948 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Jan 2025 18:49:03 +0100 Subject: [PATCH 21/48] Remove the begining of the file --- src/hidimstat/knockoffs.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index b759c40b..1c9dd2a5 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -1,11 +1,3 @@ -# -*- coding: utf-8 -*- -# Authors: Binh Nguyen -""" -Implementation of Model-X knockoffs inference procedure, introduced in -Candes et. al. (2016) " Panning for Gold: Model-X Knockoffs for -High-dimensional Controlled Variable Selection" - -""" import numpy as np from sklearn.preprocessing import StandardScaler from sklearn.covariance import LedoitWolf From 68dd7e6ecc05d2fdeb5bf2cb13eb7ae762dd867e Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 13 Jan 2025 11:15:27 +0100 Subject: [PATCH 22/48] Add equations --- src/hidimstat/gaussian_knockoff.py | 3 + src/hidimstat/knockoffs.py | 107 ++++++++++++++++++++--------- 2 files changed, 76 insertions(+), 34 deletions(-) diff --git a/src/hidimstat/gaussian_knockoff.py b/src/hidimstat/gaussian_knockoff.py index 1ad3a436..7fef86b6 100644 --- a/src/hidimstat/gaussian_knockoff.py +++ b/src/hidimstat/gaussian_knockoff.py @@ -12,6 +12,9 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14, repeat=Fals the vector of empirical mean values mu, and the empirical covariance matrix sigma. It returns the knockoff variables X_tilde. 
+ The original implementation can be found at + https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/create_gaussian.R + Parameters ---------- X: 2D ndarray (n_samples, n_features) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 1c9dd2a5..1673198d 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -6,7 +6,10 @@ from sklearn.utils import check_random_state from joblib import Parallel, delayed -from hidimstat.gaussian_knockoff import gaussian_knockoff_generation, repeat_gaussian_knockoff_generation +from hidimstat.gaussian_knockoff import ( + gaussian_knockoff_generation, + repeat_gaussian_knockoff_generation, +) from hidimstat.utils import fdr_threshold, quantile_aggregation @@ -53,8 +56,8 @@ def preconfigure_estimator_LaccosCV(estimator, X, X_tilde, y, n_lambdas=10): This function is specifically designed for the Model-X knockoffs procedure, which combines original and knockoff variables in the design matrix. """ - if type(estimator).__name__ != 'LassoCV': - raise TypeError('You should not use this function for configure your estimator') + if type(estimator).__name__ != "LassoCV": + raise TypeError("You should not use this function for configure your estimator") n_features = X.shape[1] X_ko = np.column_stack([X, X_tilde]) @@ -66,10 +69,14 @@ def preconfigure_estimator_LaccosCV(estimator, X, X_tilde, y, n_lambdas=10): def model_x_knockoff( X, y, - estimator=LassoCV(n_jobs=None, verbose=0,max_iter=1000, - cv=KFold(n_splits=5, shuffle=True, random_state=0), - tol=1e-8), - preconfigure_estimator = preconfigure_estimator_LaccosCV, + estimator=LassoCV( + n_jobs=None, + verbose=0, + max_iter=1000, + cv=KFold(n_splits=5, shuffle=True, random_state=0), + tol=1e-8, + ), + preconfigure_estimator=preconfigure_estimator_LaccosCV, centered=True, cov_estimator=LedoitWolf(assume_centered=True), seed=None, @@ -145,22 +152,26 @@ def model_x_knockoff( sigma = cov_estimator.fit(X).covariance_ # Create knockoff variables - X_tilde = gaussian_knockoff_generation( - X, mu, sigma, seed=seed + X_tilde = gaussian_knockoff_generation(X, mu, sigma, seed=seed) + + test_score = _stat_coefficient_diff( + X, X_tilde, y, estimator, preconfigure_estimator ) - - test_score = _stat_coefficient_diff(X, X_tilde, y, estimator, preconfigure_estimator) - return test_score + return test_score def model_x_knockoff_aggregation( X, y, - estimator=LassoCV(n_jobs=None, verbose=0,max_iter=1000, - cv=KFold(n_splits=5, shuffle=True, random_state=0), - tol=1e-8), - preconfigure_estimator = preconfigure_estimator_LaccosCV, + estimator=LassoCV( + n_jobs=None, + verbose=0, + max_iter=1000, + cv=KFold(n_splits=5, shuffle=True, random_state=0), + tol=1e-8, + ), + preconfigure_estimator=preconfigure_estimator_LaccosCV, centered=True, cov_estimator=LedoitWolf(assume_centered=True), joblib_verbose=0, @@ -222,7 +233,7 @@ def model_x_knockoff_aggregation( # unnecessary to have n_jobs > number of bootstraps n_jobs = min(n_bootstraps, n_jobs) parallel = Parallel(n_jobs, verbose=joblib_verbose) - + # get the seed for the different run if isinstance(random_state, (int, np.int32, np.int64)): rng = check_random_state(random_state) @@ -231,7 +242,7 @@ def model_x_knockoff_aggregation( else: raise TypeError("Wrong type for random_state") seed_list = rng.randint(1, np.iinfo(np.int32).max, n_bootstraps) - + if centered: X = StandardScaler().fit_transform(X) @@ -240,9 +251,9 @@ def model_x_knockoff_aggregation( sigma = cov_estimator.fit(X).covariance_ # Create knockoff variables - X_tilde, 
(Mu_tilde, sigma_tilde_decompose) = gaussian_knockoff_generation( - X, mu, sigma, seed=seed_list[0], repeat=True - ) + X_tilde, (Mu_tilde, sigma_tilde_decompose) = gaussian_knockoff_generation( + X, mu, sigma, seed=seed_list[0], repeat=True + ) X_tildes = parallel( delayed(repeat_gaussian_knockoff_generation)( Mu_tilde, sigma_tilde_decompose, seed=seed @@ -250,14 +261,16 @@ def model_x_knockoff_aggregation( for seed in seed_list[1:] ) X_tildes.insert(0, X_tilde) - + test_scores = parallel( - delayed(_stat_coefficient_diff)(X, X_tildes[i], y, estimator, preconfigure_estimator) + delayed(_stat_coefficient_diff)( + X, X_tildes[i], y, estimator, preconfigure_estimator + ) for i in range(n_bootstraps) ) return test_scores - + def model_x_knockoff_filter(test_score, fdr=0.1, offset=1, selection_only=True): """ @@ -295,7 +308,7 @@ def model_x_knockoff_filter(test_score, fdr=0.1, offset=1, selection_only=True): """ if offset not in (0, 1): raise ValueError("'offset' must be either 0 or 1") - + # run the knockoff filter threshold = _knockoff_threshold(test_score, fdr=fdr, offset=offset) selected = np.where(test_score >= threshold)[0] @@ -306,7 +319,9 @@ def model_x_knockoff_filter(test_score, fdr=0.1, offset=1, selection_only=True): return selected, threshold -def model_x_knockoff_pvalue(test_score, fdr=0.1, fdr_control="bhq", offset=1, selection_only=True): +def model_x_knockoff_pvalue( + test_score, fdr=0.1, fdr_control="bhq", offset=1, selection_only=True +): """ This function implements the computation of the empirical p-values @@ -353,7 +368,9 @@ def model_x_knockoff_pvalue(test_score, fdr=0.1, fdr_control="bhq", offset=1, se return selected, pvals -def model_x_knockoff_bootstrap_e_value(test_scores, fdr=0.1, offset=1, selection_only=True): +def model_x_knockoff_bootstrap_e_value( + test_scores, fdr=0.1, offset=1, selection_only=True +): """ This function implements the computation of the empirical e-values from knockoff test and aggregates them using the e-BH procedure. @@ -392,7 +409,10 @@ def model_x_knockoff_bootstrap_e_value(test_scores, fdr=0.1, offset=1, selection """ n_bootstraps = len(test_scores) evals = np.array( - [_empirical_knockoff_eval(test_scores[i], fdr / 2, offset) for i in range(n_bootstraps)] + [ + _empirical_knockoff_eval(test_scores[i], fdr / 2, offset) + for i in range(n_bootstraps) + ] ) aggregated_eval = np.mean(evals, axis=0) @@ -405,7 +425,17 @@ def model_x_knockoff_bootstrap_e_value(test_scores, fdr=0.1, offset=1, selection return selected, aggregated_eval, evals -def model_x_knockoff_bootstrap_quantile(test_scores, fdr=0.1, fdr_control="bhq", reshaping_function=None, adaptive_aggregation=False, gamma=0.5, gamma_min=0.05, offset=1, selection_only=True): +def model_x_knockoff_bootstrap_quantile( + test_scores, + fdr=0.1, + fdr_control="bhq", + reshaping_function=None, + adaptive_aggregation=False, + gamma=0.5, + gamma_min=0.05, + offset=1, + selection_only=True, +): """ This function implements the computation of the empirical p-values from knockoff test and aggregates them using the quantile aggregation procedure. 
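The aggregation entry points touched by this patch are designed to be chained: `model_x_knockoff_aggregation` returns one vector of knockoff statistics per bootstrap draw, and the two `model_x_knockoff_bootstrap_*` helpers reduce those vectors to a selection. A minimal usage sketch, mirroring the calls in the test suite; the problem sizes, FDR level, and `gamma` are illustrative values, not taken from this patch series::

    from hidimstat.data_simulation import simu_data
    from hidimstat.knockoffs import (
        model_x_knockoff_aggregation,
        model_x_knockoff_bootstrap_e_value,
        model_x_knockoff_bootstrap_quantile,
    )

    X, y, _, non_zero_index = simu_data(500, 200, seed=0)
    # one vector of knockoff test statistics per bootstrap draw
    test_scores = model_x_knockoff_aggregation(X, y, n_bootstraps=25, random_state=0)
    # p-value route: empirical p-values, quantile aggregation, then BH-type control
    selected_quantile = model_x_knockoff_bootstrap_quantile(test_scores, fdr=0.1, gamma=0.3)
    # e-value route: empirical e-values at level fdr/2, mean aggregation, then e-BH
    selected_ebh = model_x_knockoff_bootstrap_e_value(test_scores, fdr=0.1)

With the default `selection_only=True`, both helpers return only the indices of the selected features; passing `selection_only=False` additionally returns the aggregated and per-bootstrap p-values or e-values.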
@@ -514,10 +544,15 @@ def _stat_coefficient_diff(X, X_tilde, y, estimator, preconfigure_estimator=None
     if preconfigure_estimator is not None:
         preconfigure_estimator(estimator, X, X_tilde, y)
     estimator.fit(X_ko, y)
-    if hasattr(estimator, 'coef_'):
+    if hasattr(estimator, "coef_"):
         coef = np.ravel(estimator.coef_)
-    elif hasattr(estimator, 'best_estimator_') and hasattr(estimator.best_estimator_, 'coef_'):
+    elif hasattr(estimator, "best_estimator_") and hasattr(
+        estimator.best_estimator_, "coef_"
+    ):
         coef = np.ravel(estimator.best_estimator_.coef_)  # for CV object
+    else:
+        raise ValueError("estimator should be linear")
+    # Equation 1.7 in Barber & Candes (2015) or 3.6 of Candes (2018)
     test_score = np.abs(coef[:n_features]) - np.abs(coef[n_features:])
     return test_score

@@ -549,9 +584,12 @@ def _knockoff_threshold(test_score, fdr=0.1, offset=1):
         raise ValueError("'offset' must be either 0 or 1")

     threshold_mesh = np.sort(np.abs(test_score[test_score != 0]))
-    np.concatenate([[0], threshold_mesh, [np.inf]]) # if there is no solution, the threshold is inf
+    threshold_mesh = np.concatenate(
+        [[0], threshold_mesh, [np.inf]]
+    )  # if there is no solution, the threshold is inf
     # find the right value of t for getting a good fdr
-    threshold = 0.
+    # Equation 1.8 of Barber & Candes (2015) and 3.10 in Candès 2018
+    threshold = 0.0
     for threshold in threshold_mesh:
         false_pos = np.sum(test_score <= -threshold)
         selected = np.sum(test_score >= threshold)
@@ -559,6 +597,7 @@ def _knockoff_threshold(test_score, fdr=0.1, offset=1):
     return threshold

+
 def _empirical_knockoff_pval(test_score, offset=1):
     """
     Compute the empirical p-values from the knockoff test.
@@ -628,4 +667,4 @@ def _empirical_knockoff_eval(test_score, fdr=0.1, offset=1):
         else:
             evals.append(n_features / (offset + np.sum(test_score <= -ko_thr)))

-    return np.array(evals)
\ No newline at end of file
+    return np.array(evals)

From 727ee7d45f90ae3522bb31123bba69b680e0a8a7 Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Mon, 13 Jan 2025 11:19:02 +0100
Subject: [PATCH 23/48] Change reference for paper

---
 src/hidimstat/gaussian_knockoff.py | 4 ++--
 src/hidimstat/knockoffs.py         | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/hidimstat/gaussian_knockoff.py b/src/hidimstat/gaussian_knockoff.py
index 7fef86b6..381c3d45 100644
--- a/src/hidimstat/gaussian_knockoff.py
+++ b/src/hidimstat/gaussian_knockoff.py
@@ -55,7 +55,7 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14, repeat=Fals

     Sigma_inv_s = np.linalg.solve(sigma, Diag_s)

-    # First part on the RHS of equation 1.4 in Barber & Candes (2015)
+    # First part on the RHS of equation 1.4 in barber2015controlling
     Mu_tilde = X - np.dot(X - mu, Sigma_inv_s)
     # To calculate the Cholesky decomposition later on
     sigma_tilde = 2 * Diag_s - Diag_s.dot(Sigma_inv_s.dot(Diag_s)) #TODO extra operation Sigma_inv_s.dot(Diag_s) ??????
@@ -67,7 +67,7 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14, repeat=Fals
         "definite. 
Adding minor positive value to the matrix.", UserWarning ) - # Equation 1.4 in Barber & Candes (2015) + # Equation 1.4 in barber2015controlling sigma_tilde_decompose = np.linalg.cholesky(sigma_tilde) X_tilde = Mu_tilde + np.dot(U_tilde, sigma_tilde_decompose) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 1673198d..f03dab4b 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -552,7 +552,7 @@ def _stat_coefficient_diff(X, X_tilde, y, estimator, preconfigure_estimator=None coef = np.ravel(estimator.best_estimator_.coef_) # for CV object else: raise ValueError("estimator should be linear") - # Equation 1.7 in Barber & Candes (2015) or 3.6 of Candes (2018) + # Equation 1.7 in barber2015controlling or 3.6 of candes2018panning test_score = np.abs(coef[:n_features]) - np.abs(coef[n_features:]) return test_score @@ -588,7 +588,7 @@ def _knockoff_threshold(test_score, fdr=0.1, offset=1): [[0], threshold_mesh, [np.inf]] ) # if there is no solution, the threshold is inf # find the right value of t for getting a good fdr - # Equation 1.8 of Barber & Candes (2015) and 3.10 in Candès 2018 + # Equation 1.8 of barber2015controlling and 3.10 in Candès 2018 threshold = 0.0 for threshold in threshold_mesh: false_pos = np.sum(test_score <= -threshold) From efbe49a71387778d2b54264dece876e0e9b8742d Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 15 Jan 2025 11:27:13 +0100 Subject: [PATCH 24/48] add a reference --- doc_conf/references.bib | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/doc_conf/references.bib b/doc_conf/references.bib index 779af7f6..972da48a 100644 --- a/doc_conf/references.bib +++ b/doc_conf/references.bib @@ -152,7 +152,6 @@ @article{barber2015controlling publisher={JSTOR} } - @article{liuFastPowerfulConditional2021, title = {Fast and {{Powerful Conditional Randomization Testing}} via {{Distillation}}}, author = {Liu, Molei and Katsevich, Eugene and Janson, Lucas and Ramdas, Aaditya}, @@ -165,5 +164,13 @@ @article{liuFastPowerfulConditional2021 abstract = {We consider the problem of conditional independence testing: given a response Y and covariates (X,Z), we test the null hypothesis that Y is independent of X given Z. The conditional randomization test (CRT) was recently proposed as a way to use distributional information about X{\textbar}Z to exactly (non-asymptotically) control Type-I error using any test statistic in any dimensionality without assuming anything about Y{\textbar}(X,Z). This flexibility in principle allows one to derive powerful test statistics from complex prediction algorithms while maintaining statistical validity. Yet the direct use of such advanced test statistics in the CRT is prohibitively computationally expensive, especially with multiple testing, due to the CRT's requirement to recompute the test statistic many times on resampled data. We propose the distilled CRT, a novel approach to using state-of-the-art machine learning algorithms in the CRT while drastically reducing the number of times those algorithms need to be run, thereby taking advantage of their power and the CRT's statistical guarantees without suffering the usual computational expense. In addition to distillation, we propose a number of other tricks like screening and recycling computations to further speed up the CRT without sacrificing its high power and exact validity. 
Indeed, we show in simulations that all our proposals combined lead to a test that has similar power to the most powerful existing CRT implementations but requires orders of magnitude less computation, making it a practical tool even for large data sets. We demonstrate these benefits on a breast cancer dataset by identifying biomarkers related to cancer stage.}, archiveprefix = {arxiv}, keywords = {Statistics - Methodology}, - file = {/home/ahmad/Zotero/storage/8HRQZX3H/Liu et al. - 2021 - Fast and Powerful Conditional Randomization Testin.pdf;/home/ahmad/Zotero/storage/YFNDKN2B/2006.html} -} \ No newline at end of file +} + +@article{reid2016study, + title={A study of error variance estimation in lasso regression}, + author={Reid, Stephen and Tibshirani, Robert and Friedman, Jerome}, + journal={Statistica Sinica}, + pages={35--67}, + year={2016}, + publisher={JSTOR} +} From bfe111a665c423ac334f5a7b210262b0e86046c8 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 15 Jan 2025 11:27:25 +0100 Subject: [PATCH 25/48] a a new tests --- test/test_knockoff.py | 45 ++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/test/test_knockoff.py b/test/test_knockoff.py index 67747e01..affc041d 100644 --- a/test/test_knockoff.py +++ b/test/test_knockoff.py @@ -7,6 +7,9 @@ import numpy as np import pytest from sklearn.covariance import LedoitWolf, GraphicalLassoCV +from sklearn.model_selection import GridSearchCV +from sklearn.linear_model import Lasso +from sklearn.model_selection import KFold @@ -84,7 +87,6 @@ def test_knockoff_aggregation(): ) def test_model_x_knockoff(): - seed = 42 fdr = 0.2 n = 300 @@ -93,6 +95,26 @@ def test_model_x_knockoff(): test_score = model_x_knockoff(X, y, seed=seed + 1) ko_result = model_x_knockoff_filter(test_score, fdr=fdr,) fdp, power = cal_fdp_power(ko_result, non_zero) + assert fdp <= 0.2 + assert power > 0.7 + + ko_result = model_x_knockoff_pvalue(test_score, fdr=fdr, selection_only=True) + fdp, power = cal_fdp_power(ko_result, non_zero) + assert fdp <= 0.2 + assert power > 0.7 + + +def test_model_x_knockoff_estimator(): + seed = 42 + fdr = 0.2 + n = 300 + p = 300 + X, y, _, non_zero = simu_data(n, p, seed=seed) + test_score = model_x_knockoff(X, y, estimator=GridSearchCV( + Lasso(), param_grid={'alpha':np.linspace(0.2, 0.3, 5)}), + preconfigure_estimator=None, seed=seed + 1) + ko_result = model_x_knockoff_filter(test_score, fdr=fdr,) + fdp, power = cal_fdp_power(ko_result, non_zero) assert fdp <= 0.2 assert power > 0.7 @@ -100,25 +122,22 @@ def test_model_x_knockoff(): def test_estimate_distribution(): """ - TODO replace the estimate distribution by calling knockoff function with them + test different estimation of the covariance """ SEED = 42 fdr = 0.1 n = 100 p = 50 X, y, _, non_zero = simu_data(n, p, seed=SEED) - mu = X.mean(axis=0) - Sigma = LedoitWolf(assume_centered=True).fit(X).covariance_ - - - assert mu.size == p - assert Sigma.shape == (p, p) - - mu = X.mean(axis=0) - Sigma = GraphicalLassoCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X).covariance_ + test_score = model_x_knockoff(X,y, cov_estimator=LedoitWolf(assume_centered=True)) + ko_result = model_x_knockoff_filter(test_score, fdr=fdr,) + for i in ko_result: + assert np.any(i == non_zero) + test_score = model_x_knockoff(X,y, cov_estimator=GraphicalLassoCV(alphas=[1e-3, 1e-2, 1e-1, 1])) + ko_result = model_x_knockoff_filter(test_score, fdr=fdr,) + for i in ko_result: + assert np.any(i == non_zero) - assert mu.size == p - assert Sigma.shape == (p, p) def 
test_gaussian_knockoff_equi(): From 211599e0d2c996aebaf313b0c8da10ef9ebc9180 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 15 Jan 2025 11:28:06 +0100 Subject: [PATCH 26/48] rename function adn remove warning for test --- src/hidimstat/gaussian_knockoff.py | 2 +- src/hidimstat/knockoffs.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/hidimstat/gaussian_knockoff.py b/src/hidimstat/gaussian_knockoff.py index 381c3d45..9c84c6d3 100644 --- a/src/hidimstat/gaussian_knockoff.py +++ b/src/hidimstat/gaussian_knockoff.py @@ -58,7 +58,7 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14, repeat=Fals # First part on the RHS of equation 1.4 in barber2015controlling Mu_tilde = X - np.dot(X - mu, Sigma_inv_s) # To calculate the Cholesky decomposition later on - sigma_tilde = 2 * Diag_s - Diag_s.dot(Sigma_inv_s.dot(Diag_s)) #TODO extra operation Sigma_inv_s.dot(Diag_s) ?????? + sigma_tilde = 2 * Diag_s - Diag_s.dot(Sigma_inv_s) # test is the matrix is positive definite while not np.all(np.linalg.eigvalsh(sigma_tilde) > tol): sigma_tilde += 1e-10 * np.eye(n_features) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index f03dab4b..ad49373e 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -13,7 +13,7 @@ from hidimstat.utils import fdr_threshold, quantile_aggregation -def preconfigure_estimator_LaccosCV(estimator, X, X_tilde, y, n_lambdas=10): +def preconfigure_estimator_LassoCV(estimator, X, X_tilde, y, n_lambdas=10): """ Configure the estimator for Model-X knockoffs. @@ -72,11 +72,11 @@ def model_x_knockoff( estimator=LassoCV( n_jobs=None, verbose=0, - max_iter=1000, + max_iter=200000, cv=KFold(n_splits=5, shuffle=True, random_state=0), - tol=1e-8, + tol=1e-6, ), - preconfigure_estimator=preconfigure_estimator_LaccosCV, + preconfigure_estimator=preconfigure_estimator_LassoCV, centered=True, cov_estimator=LedoitWolf(assume_centered=True), seed=None, @@ -167,11 +167,11 @@ def model_x_knockoff_aggregation( estimator=LassoCV( n_jobs=None, verbose=0, - max_iter=1000, + max_iter=200000, cv=KFold(n_splits=5, shuffle=True, random_state=0), - tol=1e-8, + tol=1e-6, ), - preconfigure_estimator=preconfigure_estimator_LaccosCV, + preconfigure_estimator=preconfigure_estimator_LassoCV, centered=True, cov_estimator=LedoitWolf(assume_centered=True), joblib_verbose=0, From beeda0a41981e52e2e347180f23cb0ce19560f46 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 15 Jan 2025 11:35:08 +0100 Subject: [PATCH 27/48] Add parameters of knockoff --- src/hidimstat/knockoffs.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index ad49373e..04f5ac15 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -80,6 +80,7 @@ def model_x_knockoff( centered=True, cov_estimator=LedoitWolf(assume_centered=True), seed=None, + tol_gauss=1e-14 ): """ Model-X Knockoff @@ -127,6 +128,9 @@ def model_x_knockoff( seed : int or None, optional (default=None) The random seed used to generate the Gaussian knockoff variables. + + tol_gauss : float, optional (default=1e-14) + A tolerance value used for numerical stability in the calculation of the Cholesky decomposition in the gaussian generation function. 
Returns ------- @@ -152,7 +156,7 @@ def model_x_knockoff( sigma = cov_estimator.fit(X).covariance_ # Create knockoff variables - X_tilde = gaussian_knockoff_generation(X, mu, sigma, seed=seed) + X_tilde = gaussian_knockoff_generation(X, mu, sigma, seed=seed, tol=tol_gauss) test_score = _stat_coefficient_diff( X, X_tilde, y, estimator, preconfigure_estimator @@ -178,6 +182,7 @@ def model_x_knockoff_aggregation( n_bootstraps=25, n_jobs=1, random_state=None, + tol_gauss=1e-14 ): """ Model-X Knockoff Aggregation @@ -217,6 +222,9 @@ def model_x_knockoff_aggregation( random_state : int or None, optional (default=None) The random seed used to generate the Gaussian knockoff variables. + + tol_gauss : float, optional (default=1e-14) + A tolerance value used for numerical stability in the calculation of the Cholesky decomposition in the gaussian generation function. Returns ------- @@ -252,11 +260,11 @@ def model_x_knockoff_aggregation( # Create knockoff variables X_tilde, (Mu_tilde, sigma_tilde_decompose) = gaussian_knockoff_generation( - X, mu, sigma, seed=seed_list[0], repeat=True + X, mu, sigma, seed=seed_list[0], tol= tol_gauss, repeat=True ) X_tildes = parallel( delayed(repeat_gaussian_knockoff_generation)( - Mu_tilde, sigma_tilde_decompose, seed=seed + Mu_tilde, sigma_tilde_decompose, seed=seed, ) for seed in seed_list[1:] ) From d066c5a19ded8fb392ba2a04aa6a3beb7df5a168 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 15 Jan 2025 11:49:56 +0100 Subject: [PATCH 28/48] Format files --- src/hidimstat/__init__.py | 9 +++- src/hidimstat/gaussian_knockoff.py | 21 ++++---- src/hidimstat/knockoffs.py | 18 ++++--- test/test_knockoff.py | 85 ++++++++++++++++++++---------- 4 files changed, 85 insertions(+), 48 deletions(-) diff --git a/src/hidimstat/__init__.py b/src/hidimstat/__init__.py index e1ca8c6c..a6a78165 100644 --- a/src/hidimstat/__init__.py +++ b/src/hidimstat/__init__.py @@ -3,7 +3,14 @@ from .desparsified_lasso import desparsified_group_lasso, desparsified_lasso from .Dnn_learner_single import DnnLearnerSingle from .ensemble_clustered_inference import ensemble_clustered_inference -from .knockoffs import model_x_knockoff, model_x_knockoff_aggregation, model_x_knockoff_filter, model_x_knockoff_pvalue, model_x_knockoff_bootstrap_quantile, model_x_knockoff_bootstrap_e_value +from .knockoffs import ( + model_x_knockoff, + model_x_knockoff_aggregation, + model_x_knockoff_filter, + model_x_knockoff_pvalue, + model_x_knockoff_bootstrap_quantile, + model_x_knockoff_bootstrap_e_value +) from .multi_sample_split import aggregate_quantiles from .noise_std import group_reid, reid from .permutation_test import permutation_test_cv diff --git a/src/hidimstat/gaussian_knockoff.py b/src/hidimstat/gaussian_knockoff.py index 9c84c6d3..8b97f28b 100644 --- a/src/hidimstat/gaussian_knockoff.py +++ b/src/hidimstat/gaussian_knockoff.py @@ -12,7 +12,7 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14, repeat=Fals the vector of empirical mean values mu, and the empirical covariance matrix sigma. It returns the knockoff variables X_tilde. 
- The original implementation can be found at + The original implementation can be found at https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/create_gaussian.R Parameters @@ -60,11 +60,12 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14, repeat=Fals # To calculate the Cholesky decomposition later on sigma_tilde = 2 * Diag_s - Diag_s.dot(Sigma_inv_s) # test is the matrix is positive definite - while not np.all(np.linalg.eigvalsh(sigma_tilde) > tol): + while not np.all(np.linalg.eigvalsh(sigma_tilde) > tol): sigma_tilde += 1e-10 * np.eye(n_features) warnings.warn( "The conditional covariance matrix for knockoffs is not positive " - "definite. Adding minor positive value to the matrix.", UserWarning + "definite. Adding minor positive value to the matrix.", + UserWarning, ) # Equation 1.4 in barber2015controlling @@ -104,14 +105,13 @@ def repeat_gaussian_knockoff_generation(Mu_tilde, sigma_tilde_decompose, seed): The knockoff variables. """ n_samples, n_features = Mu_tilde.shape - + # create a uniform noise for all the data rng = np.random.RandomState(seed) U_tilde = rng.randn(n_samples, n_features) X_tilde = Mu_tilde + np.dot(U_tilde, sigma_tilde_decompose) return X_tilde - def _s_equi(sigma, tol=1e-14): @@ -120,7 +120,7 @@ def _s_equi(sigma, tol=1e-14): This function estimates the diagonal matrix of correlation between real and knockoff variables using the equi-correlated equation described in :cite:`barber2015controlling` and - :cite:`candes2018panning`. It takes as input the empirical covariance matrix sigma and a tolerance value tol, + :cite:`candes2018panning`. It takes as input the empirical covariance matrix sigma and a tolerance value tol, and returns a vector of diagonal values of the estimated matrix diag{s}. Parameters @@ -153,10 +153,10 @@ def _s_equi(sigma, tol=1e-14): lambda_min = np.min(eig_value[0]) # check if the matrix is positive-defined if lambda_min < 0: - raise Exception('The covariance matrix is not positive-definite.') + raise Exception("The covariance matrix is not positive-definite.") S = np.ones(n_features) * min(2 * lambda_min, 1) - + psd = np.all(np.linalg.eigvalsh(2 * corr_matrix - np.diag(S)) > tol) s_eps = 0 while psd is False: @@ -165,9 +165,10 @@ def _s_equi(sigma, tol=1e-14): s_eps = np.eps s_eps *= 10 # if all eigval > 0 then the matrix is positive define - psd = np.all(np.linalg.eigvalsh(2 * corr_matrix - np.diag(S * (1 - s_eps))) > tol) + psd = np.all( + np.linalg.eigvalsh(2 * corr_matrix - np.diag(S * (1 - s_eps))) > tol + ) S = S * (1 - s_eps) return S * np.diag(sigma) - diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 04f5ac15..1f45433c 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -80,7 +80,7 @@ def model_x_knockoff( centered=True, cov_estimator=LedoitWolf(assume_centered=True), seed=None, - tol_gauss=1e-14 + tol_gauss=1e-14, ): """ Model-X Knockoff @@ -128,7 +128,7 @@ def model_x_knockoff( seed : int or None, optional (default=None) The random seed used to generate the Gaussian knockoff variables. - + tol_gauss : float, optional (default=1e-14) A tolerance value used for numerical stability in the calculation of the Cholesky decomposition in the gaussian generation function. 
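For orientation while reading these formatting hunks: the single-run path of `model_x_knockoff` is typically driven as in the sketch below, mirroring the calls exercised in the test suite (the FDR level, problem sizes, and seeds are illustrative, not taken from the patch)::

    from sklearn.covariance import LedoitWolf
    from hidimstat.data_simulation import simu_data
    from hidimstat.knockoffs import model_x_knockoff, model_x_knockoff_filter

    X, y, _, non_zero = simu_data(300, 300, seed=42)
    # test_score[j] = |beta_j| - |beta_{j+p}|, from the default LassoCV fit on [X, X_tilde]
    test_score = model_x_knockoff(
        X, y, cov_estimator=LedoitWolf(assume_centered=True), seed=43
    )
    # knockoff+ filter (offset=1); selection_only=False also returns the threshold
    selected, threshold = model_x_knockoff_filter(
        test_score, fdr=0.2, selection_only=False
    )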
@@ -156,7 +156,7 @@ def model_x_knockoff( sigma = cov_estimator.fit(X).covariance_ # Create knockoff variables - X_tilde = gaussian_knockoff_generation(X, mu, sigma, seed=seed, tol=tol_gauss) + X_tilde = gaussian_knockoff_generation(X, mu, sigma, seed=seed, tol=tol_gauss) test_score = _stat_coefficient_diff( X, X_tilde, y, estimator, preconfigure_estimator @@ -182,7 +182,7 @@ def model_x_knockoff_aggregation( n_bootstraps=25, n_jobs=1, random_state=None, - tol_gauss=1e-14 + tol_gauss=1e-14, ): """ Model-X Knockoff Aggregation @@ -222,7 +222,7 @@ def model_x_knockoff_aggregation( random_state : int or None, optional (default=None) The random seed used to generate the Gaussian knockoff variables. - + tol_gauss : float, optional (default=1e-14) A tolerance value used for numerical stability in the calculation of the Cholesky decomposition in the gaussian generation function. @@ -260,11 +260,13 @@ def model_x_knockoff_aggregation( # Create knockoff variables X_tilde, (Mu_tilde, sigma_tilde_decompose) = gaussian_knockoff_generation( - X, mu, sigma, seed=seed_list[0], tol= tol_gauss, repeat=True + X, mu, sigma, seed=seed_list[0], tol=tol_gauss, repeat=True ) X_tildes = parallel( delayed(repeat_gaussian_knockoff_generation)( - Mu_tilde, sigma_tilde_decompose, seed=seed, + Mu_tilde, + sigma_tilde_decompose, + seed=seed, ) for seed in seed_list[1:] ) @@ -596,7 +598,7 @@ def _knockoff_threshold(test_score, fdr=0.1, offset=1): [[0], threshold_mesh, [np.inf]] ) # if there is no solution, the threshold is inf # find the right value of t for getting a good fdr - # Equation 1.8 of barber2015controlling and 3.10 in Candès 2018 + # Equation 1.8 of barber2015controlling and 3.10 in Candès 2018 threshold = 0.0 for threshold in threshold_mesh: false_pos = np.sum(test_score <= -threshold) diff --git a/test/test_knockoff.py b/test/test_knockoff.py index affc041d..2a9066d4 100644 --- a/test/test_knockoff.py +++ b/test/test_knockoff.py @@ -1,6 +1,11 @@ -from hidimstat.knockoffs import model_x_knockoff_aggregation, model_x_knockoff,\ - model_x_knockoff_filter, model_x_knockoff_pvalue, model_x_knockoff_bootstrap_e_value,\ - model_x_knockoff_bootstrap_quantile +from hidimstat.knockoffs import ( + model_x_knockoff_aggregation, + model_x_knockoff, + model_x_knockoff_filter, + model_x_knockoff_pvalue, + model_x_knockoff_bootstrap_e_value, + model_x_knockoff_bootstrap_quantile, +) from hidimstat.gaussian_knockoff import gaussian_knockoff_generation from hidimstat.data_simulation import simu_data from hidimstat.utils import cal_fdp_power @@ -9,9 +14,6 @@ from sklearn.covariance import LedoitWolf, GraphicalLassoCV from sklearn.model_selection import GridSearchCV from sklearn.linear_model import Lasso -from sklearn.model_selection import KFold - - def test_knockoff_aggregation(): @@ -21,15 +23,19 @@ def test_knockoff_aggregation(): n_bootstraps = 25 fdr = 0.5 X, y, _, non_zero_index = simu_data(n, p, snr=snr, seed=0) - + test_scores = model_x_knockoff_aggregation( X, y, n_bootstraps=n_bootstraps, random_state=0 ) - selected_verbose, aggregated_pval, pvals = model_x_knockoff_bootstrap_quantile(test_scores, fdr=fdr, selection_only=False) - + selected_verbose, aggregated_pval, pvals = model_x_knockoff_bootstrap_quantile( + test_scores, fdr=fdr, selection_only=False + ) + fdp_verbose, power_verbose = cal_fdp_power(selected_verbose, non_zero_index) - selected_no_verbose = model_x_knockoff_bootstrap_quantile(test_scores, fdr=fdr, selection_only=True) + selected_no_verbose = model_x_knockoff_bootstrap_quantile( + test_scores, 
fdr=fdr, selection_only=True + ) fdp_no_verbose, power_no_verbose = cal_fdp_power( selected_no_verbose, non_zero_index @@ -43,18 +49,22 @@ def test_knockoff_aggregation(): np.testing.assert_array_equal(selected_no_verbose, selected_verbose) # Single AKO (or vanilla KO) (verbose vs no verbose) - test_bootstrap = model_x_knockoff_aggregation( - X, y, n_bootstraps=2, random_state=5 + test_bootstrap = model_x_knockoff_aggregation(X, y, n_bootstraps=2, random_state=5) + test_no_bootstrap = model_x_knockoff( + X, y, seed=np.random.RandomState(5).randint(1, np.iinfo(np.int32).max, 2)[0] ) - test_no_bootstrap = model_x_knockoff(X, y, seed=np.random.RandomState(5).randint(1, np.iinfo(np.int32).max, 2)[0]) np.testing.assert_array_equal(test_bootstrap[0], test_no_bootstrap) # Using e-values aggregation (verbose vs no verbose) - selected_verbose, aggregated_eval, evals = model_x_knockoff_bootstrap_e_value(test_scores, fdr=fdr, selection_only=False) + selected_verbose, aggregated_eval, evals = model_x_knockoff_bootstrap_e_value( + test_scores, fdr=fdr, selection_only=False + ) fdp_verbose, power_verbose = cal_fdp_power(selected_verbose, non_zero_index) - selected_no_verbose = model_x_knockoff_bootstrap_e_value(test_scores, fdr=fdr, selection_only=True) + selected_no_verbose = model_x_knockoff_bootstrap_e_value( + test_scores, fdr=fdr, selection_only=True + ) fdp_no_verbose, power_no_verbose = cal_fdp_power( selected_no_verbose, non_zero_index ) @@ -86,6 +96,7 @@ def test_knockoff_aggregation(): offset=2, ) + def test_model_x_knockoff(): seed = 42 fdr = 0.2 @@ -93,11 +104,14 @@ def test_model_x_knockoff(): p = 300 X, y, _, non_zero = simu_data(n, p, seed=seed) test_score = model_x_knockoff(X, y, seed=seed + 1) - ko_result = model_x_knockoff_filter(test_score, fdr=fdr,) + ko_result = model_x_knockoff_filter( + test_score, + fdr=fdr, + ) fdp, power = cal_fdp_power(ko_result, non_zero) assert fdp <= 0.2 assert power > 0.7 - + ko_result = model_x_knockoff_pvalue(test_score, fdr=fdr, selection_only=True) fdp, power = cal_fdp_power(ko_result, non_zero) assert fdp <= 0.2 @@ -110,10 +124,17 @@ def test_model_x_knockoff_estimator(): n = 300 p = 300 X, y, _, non_zero = simu_data(n, p, seed=seed) - test_score = model_x_knockoff(X, y, estimator=GridSearchCV( - Lasso(), param_grid={'alpha':np.linspace(0.2, 0.3, 5)}), - preconfigure_estimator=None, seed=seed + 1) - ko_result = model_x_knockoff_filter(test_score, fdr=fdr,) + test_score = model_x_knockoff( + X, + y, + estimator=GridSearchCV(Lasso(), param_grid={"alpha": np.linspace(0.2, 0.3, 5)}), + preconfigure_estimator=None, + seed=seed + 1, + ) + ko_result = model_x_knockoff_filter( + test_score, + fdr=fdr, + ) fdp, power = cal_fdp_power(ko_result, non_zero) assert fdp <= 0.2 @@ -129,15 +150,22 @@ def test_estimate_distribution(): n = 100 p = 50 X, y, _, non_zero = simu_data(n, p, seed=SEED) - test_score = model_x_knockoff(X,y, cov_estimator=LedoitWolf(assume_centered=True)) - ko_result = model_x_knockoff_filter(test_score, fdr=fdr,) + test_score = model_x_knockoff(X, y, cov_estimator=LedoitWolf(assume_centered=True)) + ko_result = model_x_knockoff_filter( + test_score, + fdr=fdr, + ) for i in ko_result: - assert np.any(i == non_zero) - test_score = model_x_knockoff(X,y, cov_estimator=GraphicalLassoCV(alphas=[1e-3, 1e-2, 1e-1, 1])) - ko_result = model_x_knockoff_filter(test_score, fdr=fdr,) + assert np.any(i == non_zero) + test_score = model_x_knockoff( + X, y, cov_estimator=GraphicalLassoCV(alphas=[1e-3, 1e-2, 1e-1, 1]) + ) + ko_result = model_x_knockoff_filter( + 
test_score, + fdr=fdr, + ) for i in ko_result: - assert np.any(i == non_zero) - + assert np.any(i == non_zero) def test_gaussian_knockoff_equi(): @@ -152,4 +180,3 @@ def test_gaussian_knockoff_equi(): X_tilde = gaussian_knockoff_generation(X, mu, Sigma, seed=SEED * 2) assert X_tilde.shape == (n, p) - From f6c699ff0d1b575ed8f3c719d98707c4cbf47457 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 15 Jan 2025 11:51:20 +0100 Subject: [PATCH 29/48] format file --- src/hidimstat/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/__init__.py b/src/hidimstat/__init__.py index a6a78165..5edfcbd1 100644 --- a/src/hidimstat/__init__.py +++ b/src/hidimstat/__init__.py @@ -9,7 +9,7 @@ model_x_knockoff_filter, model_x_knockoff_pvalue, model_x_knockoff_bootstrap_quantile, - model_x_knockoff_bootstrap_e_value + model_x_knockoff_bootstrap_e_value, ) from .multi_sample_split import aggregate_quantiles from .noise_std import group_reid, reid From bf621d2d1aea7e4b05d35a477377206d52323f39 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 15 Jan 2025 17:49:59 +0100 Subject: [PATCH 30/48] Fix bugs --- examples/plot_knockoff_aggregation.py | 26 ++++++++++++-------------- test/test_knockoff.py | 4 +++- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/examples/plot_knockoff_aggregation.py b/examples/plot_knockoff_aggregation.py index a690480c..a0e70077 100644 --- a/examples/plot_knockoff_aggregation.py +++ b/examples/plot_knockoff_aggregation.py @@ -24,8 +24,12 @@ import numpy as np from hidimstat.data_simulation import simu_data -from hidimstat.knockoffs import model_x_knockoff -from hidimstat.knockoff_aggregation import knockoff_aggregation +from hidimstat.knockoffs import ( + model_x_knockoff, + model_x_knockoff_aggregation, + model_x_knockoff_bootstrap_quantile, + model_x_knockoff_bootstrap_e_value, +) from hidimstat.utils import cal_fdp_power from sklearn.utils import check_random_state import matplotlib.pyplot as plt @@ -65,28 +69,22 @@ def single_run( fdp_mx, power_mx = cal_fdp_power(mx_selection, non_zero_index) # Use p-values aggregation [2] - aggregated_ko_selection = knockoff_aggregation( + test_scores = model_x_knockoff_aggregation( X, y, - fdr=fdr, n_bootstraps=n_bootstraps, n_jobs=n_jobs, - gamma=0.3, random_state=seed, ) + aggregated_ko_selection = model_x_knockoff_bootstrap_quantile( + test_scores, fdr=fdr, gamma=0.3, selection_only=True + ) fdp_pval, power_pval = cal_fdp_power(aggregated_ko_selection, non_zero_index) # Use e-values aggregation [1] - eval_selection = knockoff_aggregation( - X, - y, - fdr=fdr, - method="e-values", - n_bootstraps=n_bootstraps, - n_jobs=n_jobs, - gamma=0.3, - random_state=seed, + eval_selection = model_x_knockoff_bootstrap_e_value( + test_scores, fdr=fdr, selection_only=True ) fdp_eval, power_eval = cal_fdp_power(eval_selection, non_zero_index) diff --git a/test/test_knockoff.py b/test/test_knockoff.py index 2a9066d4..3b900f5c 100644 --- a/test/test_knockoff.py +++ b/test/test_knockoff.py @@ -14,6 +14,7 @@ from sklearn.covariance import LedoitWolf, GraphicalLassoCV from sklearn.model_selection import GridSearchCV from sklearn.linear_model import Lasso +from sklearn.model_selection import KFold def test_knockoff_aggregation(): @@ -158,7 +159,8 @@ def test_estimate_distribution(): for i in ko_result: assert np.any(i == non_zero) test_score = model_x_knockoff( - X, y, cov_estimator=GraphicalLassoCV(alphas=[1e-3, 1e-2, 1e-1, 1]) + X, y, cov_estimator=GraphicalLassoCV(alphas=[1e-3, 1e-2, 1e-1, 1], + 
cv=KFold(n_splits=5, shuffle=True, random_state=0)) ) ko_result = model_x_knockoff_filter( test_score, From 779288bcd42a74fdf783ba7d8cd51f20fa86f4d2 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 15 Jan 2025 18:31:49 +0100 Subject: [PATCH 31/48] Fix bug in utils --- examples/plot_knockoff_aggregation.py | 26 ++++++++++++++++++++++++-- src/hidimstat/utils.py | 2 +- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/examples/plot_knockoff_aggregation.py b/examples/plot_knockoff_aggregation.py index a0e70077..f43598fb 100644 --- a/examples/plot_knockoff_aggregation.py +++ b/examples/plot_knockoff_aggregation.py @@ -26,12 +26,15 @@ from hidimstat.data_simulation import simu_data from hidimstat.knockoffs import ( model_x_knockoff, + model_x_knockoff_filter, model_x_knockoff_aggregation, model_x_knockoff_bootstrap_quantile, model_x_knockoff_bootstrap_e_value, ) from hidimstat.utils import cal_fdp_power from sklearn.utils import check_random_state +from sklearn.linear_model import LassoCV +from sklearn.model_selection import KFold import matplotlib.pyplot as plt plt.rcParams.update({"font.size": 26}) @@ -65,13 +68,32 @@ def single_run( ) # Use model-X Knockoffs [1] - mx_selection = model_x_knockoff(X, y, fdr=fdr, n_jobs=n_jobs, seed=seed) - + test_scores = model_x_knockoff( + X, + y, + estimator=LassoCV( + n_jobs=n_jobs, + verbose=0, + max_iter=200000, + cv=KFold(n_splits=5, shuffle=True, random_state=0), + tol=1e-6, + ), + seed=seed, + ) + mx_selection = model_x_knockoff_filter(test_scores, fdr=fdr) fdp_mx, power_mx = cal_fdp_power(mx_selection, non_zero_index) + # Use p-values aggregation [2] test_scores = model_x_knockoff_aggregation( X, y, + estimator=LassoCV( + n_jobs=n_jobs, + verbose=0, + max_iter=200000, + cv=KFold(n_splits=5, shuffle=True, random_state=0), + tol=1e-6, + ), n_bootstraps=n_bootstraps, n_jobs=n_jobs, random_state=seed, diff --git a/src/hidimstat/utils.py b/src/hidimstat/utils.py index efdc9336..6afdc1a4 100644 --- a/src/hidimstat/utils.py +++ b/src/hidimstat/utils.py @@ -96,7 +96,7 @@ def _ebh_threshold(evals, fdr=0.1): if selected_index <= n_features: return evals_sorted[selected_index] else: - return np.infty + return np.inf def _bhy_threshold(pvals, reshaping_function=None, fdr=0.1): From 20d05676b7dda6ec55e4190bf40125e4fdbd2486 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 15 Jan 2025 18:41:51 +0100 Subject: [PATCH 32/48] Update example knockoff --- examples/plot_knockoff_aggregation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/plot_knockoff_aggregation.py b/examples/plot_knockoff_aggregation.py index f43598fb..a634fc2b 100644 --- a/examples/plot_knockoff_aggregation.py +++ b/examples/plot_knockoff_aggregation.py @@ -74,7 +74,7 @@ def single_run( estimator=LassoCV( n_jobs=n_jobs, verbose=0, - max_iter=200000, + max_iter=1000, cv=KFold(n_splits=5, shuffle=True, random_state=0), tol=1e-6, ), @@ -90,7 +90,7 @@ def single_run( estimator=LassoCV( n_jobs=n_jobs, verbose=0, - max_iter=200000, + max_iter =1000, cv=KFold(n_splits=5, shuffle=True, random_state=0), tol=1e-6, ), From c71cc22c42ff33f1846fc92303ce9e524d096de9 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 15 Jan 2025 18:43:02 +0100 Subject: [PATCH 33/48] Formating --- examples/plot_knockoff_aggregation.py | 2 +- test/test_knockoff.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/plot_knockoff_aggregation.py b/examples/plot_knockoff_aggregation.py index a634fc2b..d8d3fda9 100644 --- 
a/examples/plot_knockoff_aggregation.py +++ b/examples/plot_knockoff_aggregation.py @@ -90,7 +90,7 @@ def single_run( estimator=LassoCV( n_jobs=n_jobs, verbose=0, - max_iter =1000, + max_iter=1000, cv=KFold(n_splits=5, shuffle=True, random_state=0), tol=1e-6, ), diff --git a/test/test_knockoff.py b/test/test_knockoff.py index 3b900f5c..af856a81 100644 --- a/test/test_knockoff.py +++ b/test/test_knockoff.py @@ -159,8 +159,12 @@ def test_estimate_distribution(): for i in ko_result: assert np.any(i == non_zero) test_score = model_x_knockoff( - X, y, cov_estimator=GraphicalLassoCV(alphas=[1e-3, 1e-2, 1e-1, 1], - cv=KFold(n_splits=5, shuffle=True, random_state=0)) + X, + y, + cov_estimator=GraphicalLassoCV( + alphas=[1e-3, 1e-2, 1e-1, 1], + cv=KFold(n_splits=5, shuffle=True, random_state=0), + ), ) ko_result = model_x_knockoff_filter( test_score, From bbd32528d68d9dd64b426bbd2233bcc89a4627ec Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 15 Jan 2025 18:49:27 +0100 Subject: [PATCH 34/48] Update function --- src/hidimstat/gaussian_knockoff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/gaussian_knockoff.py b/src/hidimstat/gaussian_knockoff.py index 8b97f28b..7fb7128a 100644 --- a/src/hidimstat/gaussian_knockoff.py +++ b/src/hidimstat/gaussian_knockoff.py @@ -162,7 +162,7 @@ def _s_equi(sigma, tol=1e-14): while psd is False: if s_eps == 0: s_eps = np.eps # avoid zeros - s_eps = np.eps + else: s_eps *= 10 # if all eigval > 0 then the matrix is positive define psd = np.all( From 9af787f8fc9287b7d9dee8edc279470bc3717fca Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Thu, 16 Jan 2025 10:06:25 +0100 Subject: [PATCH 35/48] Apply suggestions from code review Small modification Co-authored-by: bthirion --- src/hidimstat/gaussian_knockoff.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hidimstat/gaussian_knockoff.py b/src/hidimstat/gaussian_knockoff.py index 7fb7128a..df0af8ef 100644 --- a/src/hidimstat/gaussian_knockoff.py +++ b/src/hidimstat/gaussian_knockoff.py @@ -29,10 +29,10 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14, repeat=Fals seed : int, optional A random seed for generating the uniform noise used to create the knockoff variables. - tol : float, optional + tol : float, default=1.e-14 A tolerance value used for numerical stability in the calculation of the Cholesky decomposition. - repeat : bool, optional + repeat : bool, default=False If True, the function returns the values used to generate the knockoff variables (Mu_tilde and sigma_tilde_decompose), which can be used to generate additional knockoff variables without having to recompute them. 
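For reference while reading the two hunks of this diff, the sampling rule that `gaussian_knockoff_generation` implements is a restatement of equation 1.4 of :cite:`barber2015controlling`, written here with the names used in the code, where :math:`D_s = \mathrm{diag}(s)` is returned by `_s_equi`:

.. math::

    \tilde{\mu} = X - (X - \mu)\,\Sigma^{-1} D_s, \qquad
    \tilde{\Sigma} = 2 D_s - D_s\,\Sigma^{-1} D_s, \qquad
    \tilde{X} \mid X \sim \mathcal{N}(\tilde{\mu}, \tilde{\Sigma}).

`repeat_gaussian_knockoff_generation` then reuses the cached :math:`\tilde{\mu}` and the Cholesky factor of :math:`\tilde{\Sigma}`, so each extra bootstrap draw costs only one standard Gaussian sample and one matrix product.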
@@ -152,7 +152,7 @@ def _s_equi(sigma, tol=1e-14): eig_value = np.linalg.eigvalsh(corr_matrix) lambda_min = np.min(eig_value[0]) # check if the matrix is positive-defined - if lambda_min < 0: + if lambda_min <= 0: raise Exception("The covariance matrix is not positive-definite.") S = np.ones(n_features) * min(2 * lambda_min, 1) From 548e28324fbc39be45148c9df76ebf6d75dcf6c8 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 16 Jan 2025 10:18:20 +0100 Subject: [PATCH 36/48] Fix name variables --- src/hidimstat/gaussian_knockoff.py | 38 +++++++++++++++--------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/hidimstat/gaussian_knockoff.py b/src/hidimstat/gaussian_knockoff.py index df0af8ef..b351fae7 100644 --- a/src/hidimstat/gaussian_knockoff.py +++ b/src/hidimstat/gaussian_knockoff.py @@ -33,7 +33,7 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14, repeat=Fals A tolerance value used for numerical stability in the calculation of the Cholesky decomposition. repeat : bool, default=False - If True, the function returns the values used to generate the knockoff variables (Mu_tilde and sigma_tilde_decompose), + If True, the function returns the values used to generate the knockoff variables (mu_tilde and sigma_tilde_decompose), which can be used to generate additional knockoff variables without having to recompute them. Returns @@ -49,16 +49,16 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14, repeat=Fals # create a uniform noise for all the data rng = np.random.RandomState(seed) - U_tilde = rng.randn(n_samples, n_features) + u_tilde = rng.randn(n_samples, n_features) - Diag_s = np.diag(_s_equi(sigma, tol=tol)) + diag_s = np.diag(_s_equi(sigma, tol=tol)) - Sigma_inv_s = np.linalg.solve(sigma, Diag_s) + sigma_inv_s = np.linalg.solve(sigma, diag_s) # First part on the RHS of equation 1.4 in barber2015controlling - Mu_tilde = X - np.dot(X - mu, Sigma_inv_s) + mu_tilde = X - np.dot(X - mu, sigma_inv_s) # To calculate the Cholesky decomposition later on - sigma_tilde = 2 * Diag_s - Diag_s.dot(Sigma_inv_s) + sigma_tilde = 2 * diag_s - diag_s.dot(sigma_inv_s) # test is the matrix is positive definite while not np.all(np.linalg.eigvalsh(sigma_tilde) > tol): sigma_tilde += 1e-10 * np.eye(n_features) @@ -70,26 +70,26 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14, repeat=Fals # Equation 1.4 in barber2015controlling sigma_tilde_decompose = np.linalg.cholesky(sigma_tilde) - X_tilde = Mu_tilde + np.dot(U_tilde, sigma_tilde_decompose) + X_tilde = mu_tilde + np.dot(u_tilde, sigma_tilde_decompose) if not repeat: return X_tilde else: - return X_tilde, (Mu_tilde, sigma_tilde_decompose) + return X_tilde, (mu_tilde, sigma_tilde_decompose) -def repeat_gaussian_knockoff_generation(Mu_tilde, sigma_tilde_decompose, seed): +def repeat_gaussian_knockoff_generation(mu_tilde, sigma_tilde_decompose, seed): """ Generate additional knockoff variables using pre-computed values. This function generates additional knockoff variables using pre-computed values returned by the - gaussian_knockoff_generation function with repeat=True. It takes as input Mu_tilde and + gaussian_knockoff_generation function with repeat=True. It takes as input mu_tilde and sigma_tilde_decompose, which were returned by gaussian_knockoff_generation, and a random seed. It returns the new knockoff variables X_tilde. 
Parameters ---------- - Mu_tilde : 2D ndarray (n_samples, n_features) + mu_tilde : 2D ndarray (n_samples, n_features) The matrix of means used to generate the knockoff variables, returned by gaussian_knockoff_generation. sigma_tilde_decompose : 2D ndarray (n_features, n_features) @@ -104,13 +104,13 @@ def repeat_gaussian_knockoff_generation(Mu_tilde, sigma_tilde_decompose, seed): X_tilde : 2D ndarray (n_samples, n_features) The knockoff variables. """ - n_samples, n_features = Mu_tilde.shape + n_samples, n_features = mu_tilde.shape # create a uniform noise for all the data rng = np.random.RandomState(seed) - U_tilde = rng.randn(n_samples, n_features) + u_tilde = rng.randn(n_samples, n_features) - X_tilde = Mu_tilde + np.dot(U_tilde, sigma_tilde_decompose) + X_tilde = mu_tilde + np.dot(u_tilde, sigma_tilde_decompose) return X_tilde @@ -155,9 +155,9 @@ def _s_equi(sigma, tol=1e-14): if lambda_min <= 0: raise Exception("The covariance matrix is not positive-definite.") - S = np.ones(n_features) * min(2 * lambda_min, 1) + s = np.ones(n_features) * min(2 * lambda_min, 1) - psd = np.all(np.linalg.eigvalsh(2 * corr_matrix - np.diag(S)) > tol) + psd = np.all(np.linalg.eigvalsh(2 * corr_matrix - np.diag(s)) > tol) s_eps = 0 while psd is False: if s_eps == 0: @@ -166,9 +166,9 @@ def _s_equi(sigma, tol=1e-14): s_eps *= 10 # if all eigval > 0 then the matrix is positive define psd = np.all( - np.linalg.eigvalsh(2 * corr_matrix - np.diag(S * (1 - s_eps))) > tol + np.linalg.eigvalsh(2 * corr_matrix - np.diag(s * (1 - s_eps))) > tol ) - S = S * (1 - s_eps) + s = s * (1 - s_eps) - return S * np.diag(sigma) + return s * np.diag(sigma) From 0ad893e186352992ae14ad5b7c275ad4142d9f2c Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 16 Jan 2025 10:16:03 +0100 Subject: [PATCH 37/48] Fix name variables --- src/hidimstat/knockoffs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 1f45433c..0c9d190b 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -259,12 +259,12 @@ def model_x_knockoff_aggregation( sigma = cov_estimator.fit(X).covariance_ # Create knockoff variables - X_tilde, (Mu_tilde, sigma_tilde_decompose) = gaussian_knockoff_generation( + X_tilde, (mu_tilde, sigma_tilde_decompose) = gaussian_knockoff_generation( X, mu, sigma, seed=seed_list[0], tol=tol_gauss, repeat=True ) X_tildes = parallel( delayed(repeat_gaussian_knockoff_generation)( - Mu_tilde, + mu_tilde, sigma_tilde_decompose, seed=seed, ) From 252a012715842553404d01c25dd4dcc3423e0716 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 16 Jan 2025 10:16:22 +0100 Subject: [PATCH 38/48] Fix test and name variables --- test/test_knockoff.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/test/test_knockoff.py b/test/test_knockoff.py index af856a81..45d8fd8f 100644 --- a/test/test_knockoff.py +++ b/test/test_knockoff.py @@ -146,12 +146,13 @@ def test_estimate_distribution(): """ test different estimation of the covariance """ - SEED = 42 + seed = 42 fdr = 0.1 n = 100 p = 50 - X, y, _, non_zero = simu_data(n, p, seed=SEED) - test_score = model_x_knockoff(X, y, cov_estimator=LedoitWolf(assume_centered=True)) + X, y, _, non_zero = simu_data(n, p, seed=seed) + test_score = model_x_knockoff(X, y, cov_estimator=LedoitWolf(assume_centered=True), + seed=seed+1) ko_result = model_x_knockoff_filter( test_score, fdr=fdr, @@ -165,6 +166,7 @@ def test_estimate_distribution(): alphas=[1e-3, 1e-2, 1e-1, 1], 
cv=KFold(n_splits=5, shuffle=True, random_state=0), ), + seed=seed+2 ) ko_result = model_x_knockoff_filter( test_score, @@ -175,14 +177,13 @@ def test_estimate_distribution(): def test_gaussian_knockoff_equi(): - SEED = 42 - fdr = 0.1 + seed = 42 n = 100 p = 50 - X, y, _, non_zero = simu_data(n, p, seed=SEED) + X, y, _, non_zero = simu_data(n, p, seed=seed) mu = X.mean(axis=0) Sigma = LedoitWolf(assume_centered=True).fit(X).covariance_ - X_tilde = gaussian_knockoff_generation(X, mu, Sigma, seed=SEED * 2) + X_tilde = gaussian_knockoff_generation(X, mu, Sigma, seed=seed * 2) assert X_tilde.shape == (n, p) From b34cf1fc15a29a9f4b75dec7a4b9c7cc2e0bf16a Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 16 Jan 2025 11:31:29 +0100 Subject: [PATCH 39/48] Add tests and fix bugs --- src/hidimstat/gaussian_knockoff.py | 9 +++++++-- test/test_knockoff.py | 27 ++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/src/hidimstat/gaussian_knockoff.py b/src/hidimstat/gaussian_knockoff.py index b351fae7..ea17917a 100644 --- a/src/hidimstat/gaussian_knockoff.py +++ b/src/hidimstat/gaussian_knockoff.py @@ -159,15 +159,20 @@ def _s_equi(sigma, tol=1e-14): psd = np.all(np.linalg.eigvalsh(2 * corr_matrix - np.diag(s)) > tol) s_eps = 0 - while psd is False: + while not psd: if s_eps == 0: - s_eps = np.eps # avoid zeros + s_eps = np.finfo(type(s[0])).eps # avoid zeros else: s_eps *= 10 # if all eigval > 0 then the matrix is positive define psd = np.all( np.linalg.eigvalsh(2 * corr_matrix - np.diag(s * (1 - s_eps))) > tol ) + warnings.warn( + "The equi-correlated matrix for knockoffs is not positive " + "definite. Reduce the value of distance.", + UserWarning, + ) s = s * (1 - s_eps) diff --git a/test/test_knockoff.py b/test/test_knockoff.py index 45d8fd8f..9f0e3db2 100644 --- a/test/test_knockoff.py +++ b/test/test_knockoff.py @@ -6,7 +6,7 @@ model_x_knockoff_bootstrap_e_value, model_x_knockoff_bootstrap_quantile, ) -from hidimstat.gaussian_knockoff import gaussian_knockoff_generation +from hidimstat.gaussian_knockoff import gaussian_knockoff_generation, _s_equi from hidimstat.data_simulation import simu_data from hidimstat.utils import cal_fdp_power import numpy as np @@ -187,3 +187,28 @@ def test_gaussian_knockoff_equi(): X_tilde = gaussian_knockoff_generation(X, mu, Sigma, seed=seed * 2) assert X_tilde.shape == (n, p) + + +def test_s_equi_not_define_positive(): + """test the warning and error of s_equi function""" + n= 10 + tol = 1e-7 + np.random.seed(42) + + # random matrix + sigma = np.random.randn(n,n) + sigma -= np.min(sigma) + with pytest.raises(Exception, match='The covariance matrix is not positive-definite.'): + _s_equi(sigma) + + # positive matrix + while not np.all(np.linalg.eigvalsh(sigma) > tol): + sigma += 0.1 * np.eye(n) + print(np.linalg.eigvalsh(sigma)) + with pytest.warns(UserWarning, match='The equi-correlated matrix'): + _s_equi(sigma) + + # positive definite matrix + sigma = sigma.T*sigma + sigma = (sigma + sigma.T)/2 + _s_equi(sigma) \ No newline at end of file From 2ddfd9d25568ac8662eaacb93be7f8b8b1ea1a25 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 16 Jan 2025 12:00:45 +0100 Subject: [PATCH 40/48] Add a tests and formating file --- test/test_knockoff.py | 55 ++++++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/test/test_knockoff.py b/test/test_knockoff.py index 9f0e3db2..13df16b1 100644 --- a/test/test_knockoff.py +++ b/test/test_knockoff.py @@ -151,8 +151,9 @@ def 
test_estimate_distribution():
     n = 100
     p = 50
     X, y, _, non_zero = simu_data(n, p, seed=seed)
-    test_score = model_x_knockoff(X, y, cov_estimator=LedoitWolf(assume_centered=True),
-                                  seed=seed+1)
+    test_score = model_x_knockoff(
+        X, y, cov_estimator=LedoitWolf(assume_centered=True), seed=seed + 1
+    )
     ko_result = model_x_knockoff_filter(
         test_score,
         fdr=fdr,
@@ -166,7 +167,7 @@ def test_estimate_distribution():
             alphas=[1e-3, 1e-2, 1e-1, 1],
             cv=KFold(n_splits=5, shuffle=True, random_state=0),
         ),
-        seed=seed+2
+        seed=seed + 2,
     )
     ko_result = model_x_knockoff_filter(
         test_score,
@@ -182,33 +183,59 @@ def test_gaussian_knockoff_equi():
     p = 50
     X, y, _, non_zero = simu_data(n, p, seed=seed)
     mu = X.mean(axis=0)
-    Sigma = LedoitWolf(assume_centered=True).fit(X).covariance_
+    sigma = LedoitWolf(assume_centered=True).fit(X).covariance_
 
-    X_tilde = gaussian_knockoff_generation(X, mu, Sigma, seed=seed * 2)
+    X_tilde = gaussian_knockoff_generation(X, mu, sigma, seed=seed * 2)
+
+    assert X_tilde.shape == (n, p)
+
+
+def test_gaussian_knockoff_equi_warning():
+    "test warning in gaussian knockoff"
+    seed = 42
+    n = 100
+    p = 50
+    tol = 1e-7
+    rgn = np.random.RandomState(seed)
+    X = rgn.randn(n, p)
+    mu = X.mean(axis=0)
+    # create a positive definite matrix
+    sigma = rgn.randn(p, p)
+    while not np.all(np.linalg.eigvalsh(sigma) > tol):
+        sigma += 0.1 * np.eye(p)
+    sigma = sigma.T * sigma
+    sigma *= 1e-13
+    with pytest.warns(
+        UserWarning,
+        match="The conditional covariance matrix for knockoffs is not positive",
+    ):
+        X_tilde = gaussian_knockoff_generation(X, mu, sigma, seed=seed * 2, tol=1e-10)
 
     assert X_tilde.shape == (n, p)
 
 
 def test_s_equi_not_define_positive():
     """test the warning and error of s_equi function"""
-    n= 10
+    n = 10
     tol = 1e-7
-    np.random.seed(42)
+    seed = 42
 
     # random matrix
-    sigma = np.random.randn(n,n)
+    rgn = np.random.RandomState(seed)
+    sigma = rgn.randn(n, n)
     sigma -= np.min(sigma)
-    with pytest.raises(Exception, match='The covariance matrix is not positive-definite.'):
+    with pytest.raises(
+        Exception, match="The covariance matrix is not positive-definite."
+    ):
         _s_equi(sigma)
 
     # positive matrix
     while not np.all(np.linalg.eigvalsh(sigma) > tol):
         sigma += 0.1 * np.eye(n)
-    print(np.linalg.eigvalsh(sigma))
     with pytest.warns(UserWarning, match='The equi-correlated matrix'):
         _s_equi(sigma)
 
     # positive definite matrix
-    sigma = sigma.T*sigma
-    sigma = (sigma + sigma.T)/2
-    _s_equi(sigma)
\ No newline at end of file
+    sigma = sigma.T * sigma
+    sigma = (sigma + sigma.T) / 2
+    _s_equi(sigma)

From df45c82e5d643120c996def2974a0a76724748ea Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Thu, 16 Jan 2025 20:11:55 +0100
Subject: [PATCH 41/48] Restore the deleted tests

---
 test/test_stat_tools.py | 188 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 188 insertions(+)
 create mode 100644 test/test_stat_tools.py

diff --git a/test/test_stat_tools.py b/test/test_stat_tools.py
new file mode 100644
index 00000000..5e5f8fa0
--- /dev/null
+++ b/test/test_stat_tools.py
@@ -0,0 +1,188 @@
+"""
+Test the stat module
+"""
+
+import numpy as np
+from numpy.testing import assert_almost_equal, assert_equal
+
+from hidimstat.stat_tools import (
+    _replace_infinity,
+    pval_corr_from_pval,
+    pval_from_cb,
+    pval_from_scale,
+    pval_from_two_sided_pval_and_sign,
+    two_sided_pval_from_cb,
+    two_sided_pval_from_pval,
+    two_sided_pval_from_zscore,
+    zscore_from_cb,
+    zscore_from_pval,
+)
+
+
+def test__replace_infinity():
+
+    x = np.asarray([10, np.inf, -np.inf])
+
+    # replace inf by the largest absolute value times two
+    x_clean = _replace_infinity(x)
+    expected = np.asarray([10, 20, -20])
+    assert_equal(x_clean, expected)
+
+    # replace inf by 40
+    x_clean = _replace_infinity(x, replace_val=40)
+    expected = np.asarray([10, 40, -40])
+    assert_equal(x_clean, expected)
+
+    # replace inf by the largest absolute value plus one
+    x_clean = _replace_infinity(x, method="plus-one")
+    expected = np.asarray([10, 11, -11])
+    assert_equal(x_clean, expected)
+
+
+def test_pval_corr_from_pval():
+
+    pval = np.asarray([1.0, 0.025, 0.5])
+
+    # Correction for multiple testing: 3 features tested simultaneously
+    pval_corr = pval_corr_from_pval(pval)
+    expected = np.asarray([1.0, 0.075, 0.5])
+    assert_almost_equal(pval_corr, expected, decimal=10)
+
+    one_minus_pval = np.asarray([0.0, 0.975, 0.5])
+
+    # Correction for multiple testing: 3 features tested simultaneously
+    one_minus_pval_corr = pval_corr_from_pval(one_minus_pval)
+    expected = np.asarray([0.0, 0.925, 0.5])
+    assert_almost_equal(one_minus_pval_corr, expected, decimal=10)
+
+
+def test_pval_from_scale():
+
+    beta = np.asarray([-1.5, 1, 0])
+    scale = np.asarray([0.25, 0.5, 0.5])
+
+    # Computing p-value and one minus the p-value.
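+    # (Illustrative note, inferred from the expected values below: for a
+    # Gaussian statistic, pval is the upper-tail probability of beta / scale,
+    # one_minus_pval is its complement, and the *_corr variants apply a
+    # Bonferroni-style correction for the three features tested at once.)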
+ pval, pval_corr, one_minus_pval, one_minus_pval_corr = pval_from_scale(beta, scale) + expected = np.asarray( + [[1.0, 0.022, 0.5], [1.0, 0.068, 0.5], [0.0, 0.978, 0.5], [0.0, 0.932, 0.5]] + ) + + assert_almost_equal(pval, expected[0], decimal=2) + assert_almost_equal(pval_corr, expected[1], decimal=2) + assert_almost_equal(one_minus_pval, expected[2], decimal=2) + assert_almost_equal(one_minus_pval_corr, expected[3], decimal=2) + + +def test_zscore_from_cb(): + + cb_min = np.asarray([-2, 0, -1]) + cb_max = np.asarray([-1, 2, 1]) + + # Computing z-scores from 95% confidence-intervals assuming Gaussianity + zscore = zscore_from_cb(cb_min, cb_max) + expected = np.asarray([-5.87, 1.96, 0]) + + assert_almost_equal(zscore, expected, decimal=2) + + +def test_pval_from_cb(): + + cb_min = np.asarray([-2, 0, -1]) + cb_max = np.asarray([-1, 2, 1]) + + # Computing p-value and one minus the p-value. + pval, pval_corr, one_minus_pval, one_minus_pval_corr = pval_from_cb(cb_min, cb_max) + expected = np.asarray( + [[1.0, 0.025, 0.5], [1.0, 0.075, 0.5], [0.0, 0.975, 0.5], [0.0, 0.925, 0.5]] + ) + + assert_almost_equal(pval, expected[0], decimal=2) + assert_almost_equal(pval_corr, expected[1], decimal=2) + assert_almost_equal(one_minus_pval, expected[2], decimal=2) + assert_almost_equal(one_minus_pval_corr, expected[3], decimal=2) + + +def test_two_sided_pval_from_zscore(): + + zscore = np.asarray([-5.87, 1.96, 0]) + + # Computing two-sided pval from z-scores assuming Gaussianity + two_sided_pval, two_sided_pval_corr = two_sided_pval_from_zscore(zscore) + expected = np.asarray([[0.0, 0.05, 1.0], [0.0, 0.15, 1.0]]) + + assert_almost_equal(two_sided_pval, expected[0], decimal=2) + assert_almost_equal(two_sided_pval_corr, expected[1], decimal=2) + + +def test_two_sided_pval_from_cb(): + + cb_min = np.asarray([-2, 0, -1]) + cb_max = np.asarray([-1, 2, 1]) + + # Computing two-sided pval from 95% confidence bounds assuming Gaussianity + two_sided_pval, two_sided_pval_corr = two_sided_pval_from_cb(cb_min, cb_max) + expected = np.asarray([[0.0, 0.05, 1.0], [0.0, 0.15, 1.0]]) + + assert_almost_equal(two_sided_pval, expected[0], decimal=2) + assert_almost_equal(two_sided_pval_corr, expected[1], decimal=2) + + +def test_zscore_from_pval(): + + pval = np.asarray([1.0, 0.025, 0.5, 0.975]) + + # Computing z-scores from p-value + zscore = zscore_from_pval(pval) + expected = _replace_infinity( + np.asarray([-np.inf, 1.96, 0, -1.96]), replace_val=40, method="plus-one" + ) + + assert_almost_equal(zscore, expected, decimal=2) + + pval = np.asarray([1.0, 0.025, 0.5, 0.975]) + one_minus_pval = np.asarray([0.0, 0.975, 0.5, 0.025]) + + # Computing z-scores from p-value and one minus the p-value + zscore = zscore_from_pval(pval, one_minus_pval) + expected = _replace_infinity( + np.asarray([-np.inf, 1.96, 0, -1.96]), replace_val=40, method="plus-one" + ) + + assert_almost_equal(zscore, expected, decimal=2) + + +def test_pval_from_two_sided_pval_and_sign(): + + two_sided_pval = np.asarray([0.025, 0.05, 0.5]) + parameter_sign = np.asarray([-1.0, 1.0, -1.0]) + + # One-sided p-values from two-sided p-value and sign. 
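+    # (Illustrative note, inferred from the expected values below: for a
+    # positive sign, pval = two_sided_pval / 2; for a negative sign,
+    # pval = 1 - two_sided_pval / 2; one_minus_pval is always 1 - pval.)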
+ pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( + pval_from_two_sided_pval_and_sign(two_sided_pval, parameter_sign) + ) + expected = np.asarray( + [ + [0.9875, 0.025, 0.75], + [0.9625, 0.075, 0.5], + [0.0125, 0.975, 0.25], + [0.0375, 0.925, 0.5], + ] + ) + + assert_equal(pval, expected[0]) + assert_almost_equal(pval_corr, expected[1]) + assert_equal(one_minus_pval, expected[2]) + assert_almost_equal(one_minus_pval_corr, expected[3]) + + +def test_two_sided_pval_from_pval(): + + pval = np.asarray([1.0, 0.025, 0.5]) + one_minus_pval = np.asarray([0.0, 0.975, 0.5]) + + # Two-sided p-value from one-side p-values. + two_sided_pval, two_sided_pval_corr = two_sided_pval_from_pval(pval, one_minus_pval) + expected = np.asarray([[0.0, 0.05, 1.0], [0.0, 0.15, 1.0]]) + + assert_almost_equal(two_sided_pval, expected[0], decimal=2) + assert_almost_equal(two_sided_pval_corr, expected[1], decimal=2) From e3eeb7284ea7351f1725e1c4af6566b81ca633f7 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 17 Jan 2025 11:23:54 +0100 Subject: [PATCH 42/48] Improve coverage and test --- src/hidimstat/knockoffs.py | 5 +-- test/test_knockoff.py | 74 ++++++++++++++++++++++++++++---------- 2 files changed, 57 insertions(+), 22 deletions(-) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 0c9d190b..0d86a879 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -316,9 +316,6 @@ def model_x_knockoff_filter(test_score, fdr=0.1, offset=1, selection_only=True): This function calculates the knockoff threshold based on the test statistics and the desired FDR level. It then identifies the selected variables based on the threshold. """ - if offset not in (0, 1): - raise ValueError("'offset' must be either 0 or 1") - # run the knockoff filter threshold = _knockoff_threshold(test_score, fdr=fdr, offset=offset) selected = np.where(test_score >= threshold)[0] @@ -561,7 +558,7 @@ def _stat_coefficient_diff(X, X_tilde, y, estimator, preconfigure_estimator=None ): coef = np.ravel(estimator.best_estimator_.coef_) # for CV object else: - raise ValueError("estimator should be linear") + raise TypeError("estimator should be linear") # Equation 1.7 in barber2015controlling or 3.6 of candes2018panning test_score = np.abs(coef[:n_features]) - np.abs(coef[n_features:]) return test_score diff --git a/test/test_knockoff.py b/test/test_knockoff.py index 13df16b1..7681815e 100644 --- a/test/test_knockoff.py +++ b/test/test_knockoff.py @@ -15,9 +15,11 @@ from sklearn.model_selection import GridSearchCV from sklearn.linear_model import Lasso from sklearn.model_selection import KFold +from sklearn.tree import DecisionTreeRegressor def test_knockoff_aggregation(): + """ Test knockoff with aggregation""" n = 500 p = 100 snr = 5 @@ -26,7 +28,7 @@ def test_knockoff_aggregation(): X, y, _, non_zero_index = simu_data(n, p, snr=snr, seed=0) test_scores = model_x_knockoff_aggregation( - X, y, n_bootstraps=n_bootstraps, random_state=0 + X, y, n_bootstraps=n_bootstraps, random_state=None ) selected_verbose, aggregated_pval, pvals = model_x_knockoff_bootstrap_quantile( test_scores, fdr=fdr, selection_only=False @@ -99,6 +101,7 @@ def test_knockoff_aggregation(): def test_model_x_knockoff(): + """Test the selection of vatiabel from knockoff""" seed = 42 fdr = 0.2 n = 300 @@ -109,17 +112,31 @@ def test_model_x_knockoff(): test_score, fdr=fdr, ) + with pytest.raises(ValueError, match="'offset' must be either 0 or 1"): + model_x_knockoff_filter(test_score, offset=2) + with pytest.raises(ValueError, 
match="'offset' must be either 0 or 1"): + model_x_knockoff_filter(test_score, offset=-0.1) + ko_result_bis, threshold = model_x_knockoff_filter( + test_score, + fdr=fdr, + selection_only=False + ) + assert np.all(ko_result == ko_result_bis) fdp, power = cal_fdp_power(ko_result, non_zero) assert fdp <= 0.2 assert power > 0.7 ko_result = model_x_knockoff_pvalue(test_score, fdr=fdr, selection_only=True) + ko_result_bis, pvals = model_x_knockoff_pvalue(test_score, fdr=fdr, selection_only=False) + assert np.all(ko_result == ko_result_bis) fdp, power = cal_fdp_power(ko_result, non_zero) assert fdp <= 0.2 assert power > 0.7 + assert np.all( 0<=pvals) or np.all(pvals<=1) def test_model_x_knockoff_estimator(): + """ Test knockoff with a crossvalidation estimator""" seed = 42 fdr = 0.2 n = 300 @@ -141,6 +158,27 @@ def test_model_x_knockoff_estimator(): assert fdp <= 0.2 assert power > 0.7 +def test_model_x_knockoff_exception(): + n = 50 + p = 100 + seed = 45 + rgn = np.random.RandomState(seed) + X = rgn.randn(n, p) + y = rgn.randn(n) + with pytest.raises(TypeError, match="You should not use this function"): + model_x_knockoff( + X, + y, + estimator=Lasso(), + ) + with pytest.raises(TypeError, match="estimator should be linear"): + model_x_knockoff( + X, + y, + estimator=DecisionTreeRegressor(), + preconfigure_estimator=None + ) + def test_estimate_distribution(): """ @@ -178,6 +216,7 @@ def test_estimate_distribution(): def test_gaussian_knockoff_equi(): + """test function of gaussian knockoff""" seed = 42 n = 100 p = 50 @@ -195,21 +234,19 @@ def test_gaussian_knockoff_equi_warning(): seed = 42 n = 100 p = 50 - tol = 1e-7 + tol = 1e-14 rgn = np.random.RandomState(seed) X = rgn.randn(n, p) mu = X.mean(axis=0) # create a positive definite matrix - sigma = rgn.randn(p, p) - while not np.all(np.linalg.eigvalsh(sigma) > tol): - sigma += 0.1 * np.eye(p) - sigma = sigma.T * sigma - sigma *= 1e-13 + u, s, vh = np.linalg.svd(rgn.randn(p, p)) + d = np.eye(p) * tol/10 + sigma=u*d*u.T with pytest.warns( UserWarning, match="The conditional covariance matrix for knockoffs is not positive", ): - X_tilde = gaussian_knockoff_generation(X, mu, sigma, seed=seed * 2, tol=1e-10) + X_tilde = gaussian_knockoff_generation(X, mu, sigma, seed=seed * 2, tol=tol) assert X_tilde.shape == (n, p) @@ -220,22 +257,23 @@ def test_s_equi_not_define_positive(): tol = 1e-7 seed = 42 - # random matrix + # random positive matrix rgn = np.random.RandomState(seed) - sigma = rgn.randn(n, n) - sigma -= np.min(sigma) + a = rgn.randn(n, n) + a -= np.min(a) with pytest.raises( Exception, match="The covariance matrix is not positive-definite." 
): - _s_equi(sigma) + _s_equi(a) - # positive matrix - while not np.all(np.linalg.eigvalsh(sigma) > tol): - sigma += 0.1 * np.eye(n) + # matrix with positive eigenvalues, positive diagonal + while not np.all(np.linalg.eigvalsh(a) > tol): + a += 0.1 * np.eye(n) with pytest.warns(UserWarning, match="The equi-correlated matrix"): - _s_equi(sigma) + _s_equi(a) # positive definite matrix - sigma = sigma.T * sigma - sigma = (sigma + sigma.T) / 2 + u, s, vh = np.linalg.svd(a) + d = np.eye(n) + sigma=u*d*u.T _s_equi(sigma) From 4df5d90a43eb879df035b24a2c8a3eb1f5402cbb Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 17 Jan 2025 11:24:35 +0100 Subject: [PATCH 43/48] Format --- test/test_knockoff.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/test/test_knockoff.py b/test/test_knockoff.py index 7681815e..84863f71 100644 --- a/test/test_knockoff.py +++ b/test/test_knockoff.py @@ -19,7 +19,7 @@ def test_knockoff_aggregation(): - """ Test knockoff with aggregation""" + """Test knockoff with aggregation""" n = 500 p = 100 snr = 5 @@ -117,26 +117,26 @@ def test_model_x_knockoff(): with pytest.raises(ValueError, match="'offset' must be either 0 or 1"): model_x_knockoff_filter(test_score, offset=-0.1) ko_result_bis, threshold = model_x_knockoff_filter( - test_score, - fdr=fdr, - selection_only=False - ) + test_score, fdr=fdr, selection_only=False + ) assert np.all(ko_result == ko_result_bis) fdp, power = cal_fdp_power(ko_result, non_zero) assert fdp <= 0.2 assert power > 0.7 ko_result = model_x_knockoff_pvalue(test_score, fdr=fdr, selection_only=True) - ko_result_bis, pvals = model_x_knockoff_pvalue(test_score, fdr=fdr, selection_only=False) + ko_result_bis, pvals = model_x_knockoff_pvalue( + test_score, fdr=fdr, selection_only=False + ) assert np.all(ko_result == ko_result_bis) fdp, power = cal_fdp_power(ko_result, non_zero) assert fdp <= 0.2 assert power > 0.7 - assert np.all( 0<=pvals) or np.all(pvals<=1) + assert np.all(0 <= pvals) or np.all(pvals <= 1) def test_model_x_knockoff_estimator(): - """ Test knockoff with a crossvalidation estimator""" + """Test knockoff with a crossvalidation estimator""" seed = 42 fdr = 0.2 n = 300 @@ -158,6 +158,7 @@ def test_model_x_knockoff_estimator(): assert fdp <= 0.2 assert power > 0.7 + def test_model_x_knockoff_exception(): n = 50 p = 100 @@ -173,12 +174,9 @@ def test_model_x_knockoff_exception(): ) with pytest.raises(TypeError, match="estimator should be linear"): model_x_knockoff( - X, - y, - estimator=DecisionTreeRegressor(), - preconfigure_estimator=None + X, y, estimator=DecisionTreeRegressor(), preconfigure_estimator=None ) - + def test_estimate_distribution(): """ @@ -240,8 +238,8 @@ def test_gaussian_knockoff_equi_warning(): mu = X.mean(axis=0) # create a positive definite matrix u, s, vh = np.linalg.svd(rgn.randn(p, p)) - d = np.eye(p) * tol/10 - sigma=u*d*u.T + d = np.eye(p) * tol / 10 + sigma = u * d * u.T with pytest.warns( UserWarning, match="The conditional covariance matrix for knockoffs is not positive", @@ -275,5 +273,5 @@ def test_s_equi_not_define_positive(): # positive definite matrix u, s, vh = np.linalg.svd(a) d = np.eye(n) - sigma=u*d*u.T + sigma = u * d * u.T _s_equi(sigma) From 8aee16827ae7ad3192ad6e02a88c38229a502d0c Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 17 Jan 2025 18:08:43 +0100 Subject: [PATCH 44/48] Group the function agregation and not aggregation together --- doc_conf/api.rst | 1 - examples/plot_knockoff_aggregation.py | 6 +- src/hidimstat/__init__.py | 2 - 
src/hidimstat/knockoffs.py | 137 +++++++------------------- test/test_knockoff.py | 25 ++--- 5 files changed, 53 insertions(+), 118 deletions(-) diff --git a/doc_conf/api.rst b/doc_conf/api.rst index 710ff857..7490dc34 100644 --- a/doc_conf/api.rst +++ b/doc_conf/api.rst @@ -24,7 +24,6 @@ Functions group_reid hd_inference model_x_knockoff - model_x_knockoff_aggregation model_x_knockoff_filter model_x_knockoff_pvalue model_x_knockoff_bootstrap_quantile diff --git a/examples/plot_knockoff_aggregation.py b/examples/plot_knockoff_aggregation.py index d8d3fda9..145ab7aa 100644 --- a/examples/plot_knockoff_aggregation.py +++ b/examples/plot_knockoff_aggregation.py @@ -27,7 +27,6 @@ from hidimstat.knockoffs import ( model_x_knockoff, model_x_knockoff_filter, - model_x_knockoff_aggregation, model_x_knockoff_bootstrap_quantile, model_x_knockoff_bootstrap_e_value, ) @@ -78,13 +77,14 @@ def single_run( cv=KFold(n_splits=5, shuffle=True, random_state=0), tol=1e-6, ), - seed=seed, + n_bootstraps=1, + random_state=seed, ) mx_selection = model_x_knockoff_filter(test_scores, fdr=fdr) fdp_mx, power_mx = cal_fdp_power(mx_selection, non_zero_index) # Use p-values aggregation [2] - test_scores = model_x_knockoff_aggregation( + test_scores = model_x_knockoff( X, y, estimator=LassoCV( diff --git a/src/hidimstat/__init__.py b/src/hidimstat/__init__.py index 5edfcbd1..d3c0121b 100644 --- a/src/hidimstat/__init__.py +++ b/src/hidimstat/__init__.py @@ -5,7 +5,6 @@ from .ensemble_clustered_inference import ensemble_clustered_inference from .knockoffs import ( model_x_knockoff, - model_x_knockoff_aggregation, model_x_knockoff_filter, model_x_knockoff_pvalue, model_x_knockoff_bootstrap_quantile, @@ -38,7 +37,6 @@ "group_reid", "hd_inference", "model_x_knockoff", - "model_x_knockoff_aggregation", "model_x_knockoff_filter", "model_x_knockoff_pvalue", "model_x_knockoff_bootstrap_quantile", diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 0d86a879..a239c8c1 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -79,7 +79,10 @@ def model_x_knockoff( preconfigure_estimator=preconfigure_estimator_LassoCV, centered=True, cov_estimator=LedoitWolf(assume_centered=True), - seed=None, + joblib_verbose=0, + n_bootstraps=1, + n_jobs=1, + random_state=None, tol_gauss=1e-14, ): """ @@ -89,7 +92,11 @@ def model_x_knockoff( to control the False Discovery Rate (FDR) based on :cite:`candes2018panning`. The original implementation can be found at https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/knockoff_filter.R - The noisy variable are generate with a second-order knockoff variables using equi-correlated method. + The noisy variables are generated with second-order knockoff variables using the equi-correlated method. + + In addition, this function generates multiple sets of Gaussian knockoff variables and calculates + the test statistics for each set. It then aggregates the test statistics across + the sets to improve stability and power. Parameters ---------- @@ -126,95 +133,10 @@ def model_x_knockoff( attribute that returns a 2D array of the estimated covariance matrix. Examples include LedoitWolf and GraphicalLassoCV. - seed : int or None, optional (default=None) - The random seed used to generate the Gaussian knockoff variables. - - tol_gauss : float, optional (default=1e-14) - A tolerance value used for numerical stability in the calculation of the Cholesky decomposition in the gaussian generation function. 
- - Returns - ------- - test_score : 1D array, (n_features, ) - A vector of test statistics. - - Notes - ----- - This function calculates the test statistics based on the difference in absolute - coefficient values between the original and knockoff variables. - - References - ---------- - .. footbibliography:: - """ - if centered: - X = StandardScaler().fit_transform(X) - - # estimation of X distribution - # original implementation: - # https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/create_second_order.R - mu = X.mean(axis=0) - sigma = cov_estimator.fit(X).covariance_ - - # Create knockoff variables - X_tilde = gaussian_knockoff_generation(X, mu, sigma, seed=seed, tol=tol_gauss) - - test_score = _stat_coefficient_diff( - X, X_tilde, y, estimator, preconfigure_estimator - ) - - return test_score - - -def model_x_knockoff_aggregation( - X, - y, - estimator=LassoCV( - n_jobs=None, - verbose=0, - max_iter=200000, - cv=KFold(n_splits=5, shuffle=True, random_state=0), - tol=1e-6, - ), - preconfigure_estimator=preconfigure_estimator_LassoCV, - centered=True, - cov_estimator=LedoitWolf(assume_centered=True), - joblib_verbose=0, - n_bootstraps=25, - n_jobs=1, - random_state=None, - tol_gauss=1e-14, -): - """ - Model-X Knockoff Aggregation - - This function generates multiple sets of Gaussian knockoff variables and calculates - the test statistics for each set. It then aggregates the test statistics across - the sets to improve stability and power. - - Parameters - ---------- - X : 2D ndarray (n_samples, n_features) - The design matrix. - - y : 1D ndarray (n_samples, ) - The target vector. - - estimator : sklearn estimator instance or a cross validation instance, optional - The estimator used for fitting the data and computing the test statistics. - - preconfigure_estimator : function, optional - A function that configures the estimator for the Model-X knockoff procedure. - - centered : bool, optional (default=True) - Whether to standardize the data before performing the inference procedure. - - cov_estimator : sklearn covariance estimator instance, optional - The method used to estimate the empirical covariance matrix. - joblib_verbose : int, optional (default=0) The verbosity level of the joblib parallel processing. - n_bootstraps : int, optional (default=25) + n_bootstraps : int, optional (default=1) The number of bootstrap samples to generate for aggregation. n_jobs : int, optional (default=1) @@ -236,8 +158,12 @@ def model_x_knockoff_aggregation( This function generates multiple sets of Gaussian knockoff variables and calculates the test statistics for each set using the `_stat_coefficient_diff` function. It then aggregates the test statistics across the sets to improve stability and power. + + References + ---------- + .. 
footbibliography::
     """
-    assert n_bootstraps > 1, "the number of bootstraps should at least higher than 1"
+    assert n_bootstraps > 0, "the number of bootstraps should be at least 1"
     # unnecessary to have n_jobs > number of bootstraps
     n_jobs = min(n_bootstraps, n_jobs)
     parallel = Parallel(n_jobs, verbose=joblib_verbose)
@@ -253,22 +181,30 @@ def model_x_knockoff(
         X = StandardScaler().fit_transform(X)
 
     # estimation of X distribution
+    # original implementation:
+    # https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/create_second_order.R
     mu = X.mean(axis=0)
     sigma = cov_estimator.fit(X).covariance_
 
     # Create knockoff variables
-    X_tilde, (mu_tilde, sigma_tilde_decompose) = gaussian_knockoff_generation(
-        X, mu, sigma, seed=seed_list[0], tol=tol_gauss, repeat=True
+    data_generated = gaussian_knockoff_generation(
+        X, mu, sigma, seed=seed_list[0], tol=tol_gauss, repeat=(n_bootstraps > 1)
     )
-    X_tildes = parallel(
-        delayed(repeat_gaussian_knockoff_generation)(
-            mu_tilde,
-            sigma_tilde_decompose,
-            seed=seed,
+
+    if n_bootstraps == 1:
+        X_tilde = data_generated
+        X_tildes = [X_tilde]
+    else:
+        X_tilde, (mu_tilde, sigma_tilde_decompose) = data_generated
+        X_tildes = parallel(
+            delayed(repeat_gaussian_knockoff_generation)(
+                mu_tilde,
+                sigma_tilde_decompose,
+                seed=seed,
+            )
+            for seed in seed_list[1:]
         )
-        for seed in seed_list[1:]
-    )
-    X_tildes.insert(0, X_tilde)
+        X_tildes.insert(0, X_tilde)
 
     test_scores = parallel(
         delayed(_stat_coefficient_diff)(
@@ -279,7 +213,10 @@ def model_x_knockoff(
         for i in range(n_bootstraps)
     )
 
-    return test_scores
+    if n_bootstraps == 1:
+        return test_scores[0]
+    else:
+        return test_scores
 
 
 def model_x_knockoff_filter(test_score, fdr=0.1, offset=1, selection_only=True):
diff --git a/test/test_knockoff.py b/test/test_knockoff.py
index 84863f71..d1d180d1 100644
--- a/test/test_knockoff.py
+++ b/test/test_knockoff.py
@@ -1,5 +1,4 @@
 from hidimstat.knockoffs import (
-    model_x_knockoff_aggregation,
     model_x_knockoff,
     model_x_knockoff_filter,
     model_x_knockoff_pvalue,
@@ -26,9 +25,7 @@ def test_knockoff_aggregation():
     fdr = 0.5
     X, y, _, non_zero_index = simu_data(n, p, snr=snr, seed=0)
 
-    test_scores = model_x_knockoff_aggregation(
-        X, y, n_bootstraps=n_bootstraps, random_state=0
-    )
+    test_scores = model_x_knockoff(X, y, n_bootstraps=n_bootstraps, random_state=None)
     selected_verbose, aggregated_pval, pvals = model_x_knockoff_bootstrap_quantile(
         test_scores, fdr=fdr, selection_only=False
     )
@@ -52,10 +51,8 @@ def test_knockoff_aggregation():
     np.testing.assert_array_equal(selected_no_verbose, selected_verbose)
 
     # Single AKO (or vanilla KO) (verbose vs no verbose)
-    test_bootstrap = model_x_knockoff_aggregation(X, y, n_bootstraps=2, random_state=5)
-    test_no_bootstrap = model_x_knockoff(
-        X, y, seed=np.random.RandomState(5).randint(1, np.iinfo(np.int32).max, 2)[0]
-    )
+    test_bootstrap = model_x_knockoff(X, y, n_bootstraps=2, random_state=5)
+    test_no_bootstrap = model_x_knockoff(X, y, n_bootstraps=1, random_state=5)
 
     np.testing.assert_array_equal(test_bootstrap[0], test_no_bootstrap)
 
@@ -80,7 +77,7 @@ def test_knockoff_aggregation():
 
     # Checking wrong type for random_state
     with pytest.raises(Exception):
-        _ = model_x_knockoff_aggregation(
+        _ = model_x_knockoff(
             X,
             y,
             random_state="test",
@@ -107,7 +104,7 @@ def test_model_x_knockoff():
     n = 300
     p = 300
     X, y, _, non_zero = simu_data(n, p, seed=seed)
-    test_score = model_x_knockoff(X, y, seed=seed + 1)
+    test_score = model_x_knockoff(X, y, n_bootstraps=1, random_state=seed + 1)
     ko_result = model_x_knockoff_filter(
         test_score,
         fdr=fdr,
@@ -146,8 +143,9 @@ def
test_model_x_knockoff_estimator(): X, y, estimator=GridSearchCV(Lasso(), param_grid={"alpha": np.linspace(0.2, 0.3, 5)}), + n_bootstraps=1, preconfigure_estimator=None, - seed=seed + 1, + random_state=seed + 1, ) ko_result = model_x_knockoff_filter( test_score, @@ -171,10 +169,12 @@ def test_model_x_knockoff_exception(): X, y, estimator=Lasso(), + n_bootstraps=1, ) with pytest.raises(TypeError, match="estimator should be linear"): model_x_knockoff( - X, y, estimator=DecisionTreeRegressor(), preconfigure_estimator=None + X, y, estimator=DecisionTreeRegressor(), preconfigure_estimator=None, + n_bootstraps=1 ) @@ -188,7 +188,7 @@ def test_estimate_distribution(): p = 50 X, y, _, non_zero = simu_data(n, p, seed=seed) test_score = model_x_knockoff( - X, y, cov_estimator=LedoitWolf(assume_centered=True), seed=seed + 1 + X, y, cov_estimator=LedoitWolf(assume_centered=True), n_bootstraps=1, random_state=seed + 1 ) ko_result = model_x_knockoff_filter( test_score, @@ -203,7 +203,8 @@ def test_estimate_distribution(): alphas=[1e-3, 1e-2, 1e-1, 1], cv=KFold(n_splits=5, shuffle=True, random_state=0), ), - seed=seed + 2, + n_bootstraps=1, + random_state=seed + 2, ) ko_result = model_x_knockoff_filter( test_score, From bfe634657acf63d5b07e56754738e05a186a330a Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 17 Jan 2025 18:11:44 +0100 Subject: [PATCH 45/48] formating --- test/test_knockoff.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/test/test_knockoff.py b/test/test_knockoff.py index d1d180d1..6e6b9a9a 100644 --- a/test/test_knockoff.py +++ b/test/test_knockoff.py @@ -26,9 +26,7 @@ def test_knockoff_aggregation(): fdr = 0.5 X, y, _, non_zero_index = simu_data(n, p, snr=snr, seed=0) - test_scores = model_x_knockoff( - X, y, n_bootstraps=n_bootstraps, random_state=None - ) + test_scores = model_x_knockoff(X, y, n_bootstraps=n_bootstraps, random_state=None) selected_verbose, aggregated_pval, pvals = model_x_knockoff_bootstrap_quantile( test_scores, fdr=fdr, selection_only=False ) @@ -173,8 +171,11 @@ def test_model_x_knockoff_exception(): ) with pytest.raises(TypeError, match="estimator should be linear"): model_x_knockoff( - X, y, estimator=DecisionTreeRegressor(), preconfigure_estimator=None, - n_bootstraps=1 + X, + y, + estimator=DecisionTreeRegressor(), + preconfigure_estimator=None, + n_bootstraps=1, ) @@ -188,7 +189,11 @@ def test_estimate_distribution(): p = 50 X, y, _, non_zero = simu_data(n, p, seed=seed) test_score = model_x_knockoff( - X, y, cov_estimator=LedoitWolf(assume_centered=True), n_bootstraps=1, random_state=seed + 1 + X, + y, + cov_estimator=LedoitWolf(assume_centered=True), + n_bootstraps=1, + random_state=seed + 1, ) ko_result = model_x_knockoff_filter( test_score, From 4e3f4d28bb5a2a10bf2acf7090c9bec36407bd2b Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 20 Jan 2025 14:08:37 +0100 Subject: [PATCH 46/48] Replace lambda by alpha --- src/hidimstat/knockoffs.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index a239c8c1..acf39bbf 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -13,13 +13,13 @@ from hidimstat.utils import fdr_threshold, quantile_aggregation -def preconfigure_estimator_LassoCV(estimator, X, X_tilde, y, n_lambdas=10): +def preconfigure_estimator_LassoCV(estimator, X, X_tilde, y, n_alphas=10): """ Configure the estimator for Model-X knockoffs. 
This function sets up the regularization path for the Lasso estimator - based on the input data and the number of lambdas to use. The regularization - path is defined by a sequence of lambda values, which control the amount + based on the input data and the number of alphas to use. The regularization + path is defined by a sequence of alpha values, which control the amount of shrinkage applied to the coefficient estimates. Parameters @@ -38,8 +38,8 @@ def preconfigure_estimator_LassoCV(estimator, X, X_tilde, y, n_lambdas=10): y : 1D ndarray (n_samples, ) The target vector. - n_lambdas : int, optional (default=10) - The number of lambda values to use to instantiate the cross-validation. + n_alphas : int, optional (default=10) + The number of alpha values to use to instantiate the cross-validation. Returns ------- @@ -61,9 +61,9 @@ def preconfigure_estimator_LassoCV(estimator, X, X_tilde, y, n_lambdas=10): n_features = X.shape[1] X_ko = np.column_stack([X, X_tilde]) - lambda_max = np.max(np.dot(X_ko.T, y)) / (2 * n_features) - lambdas = np.linspace(lambda_max * np.exp(-n_lambdas), lambda_max, n_lambdas) - estimator.alphas = lambdas + alpha_max = np.max(np.dot(X_ko.T, y)) / (2 * n_features) + alphas = np.linspace(alpha_max * np.exp(-n_alphas), alpha_max, n_alphas) + estimator.alphas = alphas def model_x_knockoff( @@ -112,7 +112,7 @@ def model_x_knockoff( a 1D array, and a `coef_` attribute that returns a 1D array of coefficients. Examples include LassoCV, LogisticRegressionCV, and LinearRegression. Configuration example: - LassoCV(alphas=lambdas, n_jobs=None, verbose=0, max_iter=1000, + LassoCV(alphas=alphas, n_jobs=None, verbose=0, max_iter=1000, cv=KFold(n_splits=5, shuffle=True, random_state=0), tol=1e-8) LogisticRegressionCV(penalty="l1", max_iter=1000, solver="liblinear", cv=KFold(n_splits=5, shuffle=True, random_state=0), n_jobs=None, tol=1e-8) From 56759698ecc134a984b616508d86477efcfba16d Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 10 Feb 2025 16:56:46 +0100 Subject: [PATCH 47/48] Change cut in the API --- doc_conf/api.rst | 1 - src/hidimstat/__init__.py | 2 - src/hidimstat/knockoffs.py | 97 ++++++++++---------------- test/test_knockoff.py | 138 ++++++++++++++++++++++--------------- 4 files changed, 121 insertions(+), 117 deletions(-) diff --git a/doc_conf/api.rst b/doc_conf/api.rst index 7490dc34..fbdacaa2 100644 --- a/doc_conf/api.rst +++ b/doc_conf/api.rst @@ -24,7 +24,6 @@ Functions group_reid hd_inference model_x_knockoff - model_x_knockoff_filter model_x_knockoff_pvalue model_x_knockoff_bootstrap_quantile model_x_knockoff_bootstrap_e_value diff --git a/src/hidimstat/__init__.py b/src/hidimstat/__init__.py index d3c0121b..11381841 100644 --- a/src/hidimstat/__init__.py +++ b/src/hidimstat/__init__.py @@ -5,7 +5,6 @@ from .ensemble_clustered_inference import ensemble_clustered_inference from .knockoffs import ( model_x_knockoff, - model_x_knockoff_filter, model_x_knockoff_pvalue, model_x_knockoff_bootstrap_quantile, model_x_knockoff_bootstrap_e_value, @@ -37,7 +36,6 @@ "group_reid", "hd_inference", "model_x_knockoff", - "model_x_knockoff_filter", "model_x_knockoff_pvalue", "model_x_knockoff_bootstrap_quantile", "model_x_knockoff_bootstrap_e_value", diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index acf39bbf..ad7a2d86 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -84,6 +84,8 @@ def model_x_knockoff( n_jobs=1, random_state=None, tol_gauss=1e-14, + fdr=0.1, + offset=1, ): """ Model-X Knockoff @@ -148,6 +150,16 @@ def 
model_x_knockoff(
     n_jobs=1,
     random_state=None,
     tol_gauss=1e-14,
+    fdr=0.1,
+    offset=1,
 ):
     """
     Model-X Knockoff
@@ -148,6 +150,13 @@ def model_x_knockoff(
     tol_gauss : float, optional (default=1e-14)
         A tolerance value used for numerical stability in the calculation of the Cholesky decomposition in the gaussian generation function.
 
+    fdr : float, optional (default=0.1)
+        The desired controlled False Discovery Rate (FDR) level.
+
+    offset : int, 0 or 1, optional (default=1)
+        The offset to calculate the knockoff threshold. An offset of 1 is equivalent to
+        knockoff+.
+
     Returns
     -------
     test_scores : 2D ndarray (n_bootstraps, n_features)
@@ -164,8 +173,11 @@ def model_x_knockoff(
     Notes
     -----
     This function generates multiple sets of Gaussian knockoff variables and calculates
-    the test statistics for each set using the `_stat_coefficient_diff` function. It
-    then aggregates the test statistics across the sets to improve stability and power.
+    the test statistics for each set using the `_stat_coefficient_diff` function.
+    `_stat_coefficient_diff` computes the knockoff threshold from the test statistics
+    and the desired FDR level, and identifies the selected variables based on that
+    threshold. The results are then aggregated across the sets to improve stability
+    and power.
 
     References
     ----------
@@ -206,20 +218,18 @@ def model_x_knockoff(
         X_tildes.insert(0, X_tilde)
 
-    test_scores = parallel(
+    results = parallel(
         delayed(_stat_coefficient_diff)(
-            X, X_tildes[i], y, estimator, preconfigure_estimator
+            X, X_tildes[i], y, estimator, fdr, offset, preconfigure_estimator
         )
         for i in range(n_bootstraps)
     )
+    test_scores, threshold, selected = zip(*results)
 
     if n_bootstraps == 1:
-        return test_scores[0]
+        return selected[0], test_scores[0], threshold[0], X_tildes[0]
     else:
-        return test_scores
+        return selected, test_scores, threshold, X_tildes
 
 
-def model_x_knockoff_filter(test_score, fdr=0.1, offset=1, selection_only=True):
-    """
-    Calculate the p-values and return the selected variables based on the knockoff filter.
-
-    Parameters
-    ----------
-    test_score : 1D array, (n_features, )
-        A vector of test statistics.
-
-    fdr : float, optional (default=0.1)
-        The desired controlled False Discovery Rate (FDR) level.
-
-    offset : int, 0 or 1, optional (default=1)
-        The offset to calculate the knockoff threshold. An offset of 1 is equivalent to
-        knockoff+.
-
-    selection_only : bool, optional (default=True)
-        Whether to return only the selected variables or additional information.
-        If True, the function will return only the selected variables. If False,
-        the function will return the selected variables and the threshold.
-
-    Returns
-    -------
-    selected : 1D array, int
-        A vector of indices of the selected variables.
-
-    threshold : float
-        The knockoff threshold.
-
-    Notes
-    -----
-    This function calculates the knockoff threshold based on the test statistics and the
-    desired FDR level. It then identifies the selected variables based on the threshold.
- """ - # run the knockoff filter - threshold = _knockoff_threshold(test_score, fdr=fdr, offset=offset) - selected = np.where(test_score >= threshold)[0] - - if selection_only: - return selected - else: - return selected, threshold + return selected, test_scores, threshold, X_tildes def model_x_knockoff_pvalue( @@ -313,7 +284,7 @@ def model_x_knockoff_pvalue( def model_x_knockoff_bootstrap_e_value( - test_scores, fdr=0.1, offset=1, selection_only=True + test_scores, ko_threshold, fdr=0.1, offset=1, selection_only=True ): """ This function implements the computation of the empirical e-values @@ -324,6 +295,9 @@ def model_x_knockoff_bootstrap_e_value( test_scores : 2D array, (n_bootstraps, n_features) A matrix of test statistics for each bootstrap sample. + ko_threshold : float + Threshold level. + fdr : float, optional (default=0.1) The desired controlled False Discovery Rate (FDR) level. @@ -354,7 +328,7 @@ def model_x_knockoff_bootstrap_e_value( n_bootstraps = len(test_scores) evals = np.array( [ - _empirical_knockoff_eval(test_scores[i], fdr / 2, offset) + _empirical_knockoff_eval(test_scores[i], ko_threshold[i], offset) for i in range(n_bootstraps) ] ) @@ -454,7 +428,9 @@ def model_x_knockoff_bootstrap_quantile( return selected, aggregated_pval, pvals -def _stat_coefficient_diff(X, X_tilde, y, estimator, preconfigure_estimator=None): +def _stat_coefficient_diff( + X, X_tilde, y, estimator, fdr, offset, preconfigure_estimator=None +): """ Compute statistic based on a cross-validation procedure. @@ -498,7 +474,12 @@ def _stat_coefficient_diff(X, X_tilde, y, estimator, preconfigure_estimator=None raise TypeError("estimator should be linear") # Equation 1.7 in barber2015controlling or 3.6 of candes2018panning test_score = np.abs(coef[:n_features]) - np.abs(coef[n_features:]) - return test_score + + # Compute the threshold level and selecte the important variables + ko_thr = _knockoff_threshold(test_score, fdr=fdr, offset=offset) + selected = np.where(test_score >= ko_thr)[0] + + return test_score, ko_thr, selected def _knockoff_threshold(test_score, fdr=0.1, offset=1): @@ -577,7 +558,7 @@ def _empirical_knockoff_pval(test_score, offset=1): return np.array(pvals) -def _empirical_knockoff_eval(test_score, fdr=0.1, offset=1): +def _empirical_knockoff_eval(test_score, ko_thr, offset=1): """ Compute the empirical e-values from the knockoff test. @@ -586,8 +567,8 @@ def _empirical_knockoff_eval(test_score, fdr=0.1, offset=1): test_score : 1D ndarray, shape (n_features, ) Vector of test statistics. - fdr : float, optional - Desired controlled FDR (false discovery rate) level. + threshold : float + Threshold level. offset : int, 0 or 1, optional Offset equals 1 is the knockoff+ procedure. 
@@ -603,8 +584,6 @@ def _empirical_knockoff_eval(test_score, fdr=0.1, offset=1): if offset not in (0, 1): raise ValueError("'offset' must be either 0 or 1") - ko_thr = _knockoff_threshold(test_score, fdr=fdr, offset=offset) - for i in range(n_features): if test_score[i] < ko_thr: evals.append(0) diff --git a/test/test_knockoff.py b/test/test_knockoff.py index 6e6b9a9a..b93b2ca4 100644 --- a/test/test_knockoff.py +++ b/test/test_knockoff.py @@ -1,6 +1,5 @@ from hidimstat.knockoffs import ( model_x_knockoff, - model_x_knockoff_filter, model_x_knockoff_pvalue, model_x_knockoff_bootstrap_e_value, model_x_knockoff_bootstrap_quantile, @@ -17,8 +16,8 @@ from sklearn.tree import DecisionTreeRegressor -def test_knockoff_aggregation(): - """Test knockoff with aggregation""" +def test_knockoff_bootstrap_quantile(): + """Test bootstrap knockoof with quantile aggregation""" n = 500 p = 100 snr = 5 @@ -26,17 +25,18 @@ def test_knockoff_aggregation(): fdr = 0.5 X, y, _, non_zero_index = simu_data(n, p, snr=snr, seed=0) - test_scores = model_x_knockoff(X, y, n_bootstraps=n_bootstraps, random_state=None) + selected, test_scores, threshold, X_tildes = model_x_knockoff( + X, y, n_bootstraps=n_bootstraps, random_state=None, fdr=fdr + ) + selected_verbose, aggregated_pval, pvals = model_x_knockoff_bootstrap_quantile( test_scores, fdr=fdr, selection_only=False ) - fdp_verbose, power_verbose = cal_fdp_power(selected_verbose, non_zero_index) selected_no_verbose = model_x_knockoff_bootstrap_quantile( test_scores, fdr=fdr, selection_only=True ) - fdp_no_verbose, power_no_verbose = cal_fdp_power( selected_no_verbose, non_zero_index ) @@ -48,76 +48,113 @@ def test_knockoff_aggregation(): assert power_no_verbose > 0.1 np.testing.assert_array_equal(selected_no_verbose, selected_verbose) - # Single AKO (or vanilla KO) (verbose vs no verbose) - test_bootstrap = model_x_knockoff(X, y, n_bootstraps=2, random_state=5) - test_no_bootstrap = model_x_knockoff(X, y, n_bootstraps=1, random_state=5) + # Checking value for offset not belonging to (0,1) + with pytest.raises(Exception): + _ = model_x_knockoff_bootstrap_quantile( + test_scores, + offset=2, + ) - np.testing.assert_array_equal(test_bootstrap[0], test_no_bootstrap) - # Using e-values aggregation (verbose vs no verbose) +def test_knockoff_bootstrap_e_values(): + """Test bootstrap Knockoff with e-values""" + n = 500 + p = 100 + snr = 5 + n_bootstraps = 25 + fdr = 0.5 + X, y, _, non_zero_index = simu_data(n, p, snr=snr, seed=0) + + selected, test_scores, threshold, X_tildes = model_x_knockoff( + X, y, n_bootstraps=n_bootstraps, random_state=None, fdr=fdr / 2 + ) + # Using e-values aggregation (verbose vs no verbose) selected_verbose, aggregated_eval, evals = model_x_knockoff_bootstrap_e_value( - test_scores, fdr=fdr, selection_only=False + test_scores, threshold, fdr=fdr, selection_only=False ) fdp_verbose, power_verbose = cal_fdp_power(selected_verbose, non_zero_index) + selected_no_verbose = model_x_knockoff_bootstrap_e_value( - test_scores, fdr=fdr, selection_only=True + test_scores, threshold, fdr=fdr, selection_only=True ) fdp_no_verbose, power_no_verbose = cal_fdp_power( selected_no_verbose, non_zero_index ) - assert pvals.shape == (n_bootstraps, p) assert fdp_verbose < 0.5 assert power_verbose > 0.1 assert fdp_no_verbose < 0.5 assert power_no_verbose > 0.1 - # Checking wrong type for random_state - with pytest.raises(Exception): - _ = model_x_knockoff( - X, - y, - random_state="test", - ) - # Checking value for offset not belonging to (0,1) with 
pytest.raises(Exception): - _ = model_x_knockoff_bootstrap_quantile( + _ = model_x_knockoff_bootstrap_e_value( test_scores, + threshold, offset=2, ) + +def test_invariant_with_bootstrap(): + """Test bootstrap Knockoff""" + n = 500 + p = 100 + snr = 5 + fdr = 0.5 + X, y, _, non_zero_index = simu_data(n, p, snr=snr, seed=0) + # Single AKO (or vanilla KO) (verbose vs no verbose) + ( + selected_bootstrap, + test_scores_bootstrap, + threshold_bootstrap, + X_tildes_bootstrap, + ) = model_x_knockoff(X, y, n_bootstraps=2, random_state=5, fdr=fdr) + ( + selected_no_bootstrap, + test_scores_no_bootstrap, + threshold_no_bootstrap, + X_tildes_no_bootstrap, + ) = model_x_knockoff(X, y, n_bootstraps=1, random_state=5, fdr=fdr) + + np.testing.assert_array_equal(test_scores_bootstrap[0], test_scores_no_bootstrap) + np.testing.assert_array_equal(selected_bootstrap[0], selected_no_bootstrap) + np.testing.assert_array_equal(threshold_bootstrap[0], threshold_no_bootstrap) + np.testing.assert_array_equal(X_tildes_bootstrap[0], X_tildes_no_bootstrap) + + +def test_knockoff_exception(): + """Test exception raise by Knockoff""" + n = 500 + p = 100 + snr = 5 + X, y, _, non_zero_index = simu_data(n, p, snr=snr, seed=0) + + # Checking wrong type for random_state with pytest.raises(Exception): - _ = model_x_knockoff_bootstrap_e_value( - test_scores, - offset=2, + _ = model_x_knockoff( + X, + y, + random_state="test", ) def test_model_x_knockoff(): - """Test the selection of vatiabel from knockoff""" + """Test the selection of variable from knockoff""" seed = 42 fdr = 0.2 n = 300 p = 300 X, y, _, non_zero = simu_data(n, p, seed=seed) - test_score = model_x_knockoff(X, y, n_bootstraps=1, random_state=seed + 1) - ko_result = model_x_knockoff_filter( - test_score, - fdr=fdr, + selected, test_score, threshold, X_tildes = model_x_knockoff( + X, y, n_bootstraps=1, random_state=seed + 1, fdr=fdr ) with pytest.raises(ValueError, match="'offset' must be either 0 or 1"): - model_x_knockoff_filter(test_score, offset=2) + model_x_knockoff(X, y, n_bootstraps=1, random_state=seed + 1, offset=2, fdr=fdr) with pytest.raises(ValueError, match="'offset' must be either 0 or 1"): - model_x_knockoff_filter(test_score, offset=-0.1) - ko_result_bis, threshold = model_x_knockoff_filter( - test_score, fdr=fdr, selection_only=False - ) - assert np.all(ko_result == ko_result_bis) - fdp, power = cal_fdp_power(ko_result, non_zero) - assert fdp <= 0.2 - assert power > 0.7 + model_x_knockoff( + X, y, n_bootstraps=1, random_state=seed + 1, offset=-0.1, fdr=fdr + ) ko_result = model_x_knockoff_pvalue(test_score, fdr=fdr, selection_only=True) ko_result_bis, pvals = model_x_knockoff_pvalue( @@ -137,19 +174,16 @@ def test_model_x_knockoff_estimator(): n = 300 p = 300 X, y, _, non_zero = simu_data(n, p, seed=seed) - test_score = model_x_knockoff( + selected, test_scores, threshold, X_tildes = model_x_knockoff( X, y, estimator=GridSearchCV(Lasso(), param_grid={"alpha": np.linspace(0.2, 0.3, 5)}), n_bootstraps=1, preconfigure_estimator=None, random_state=seed + 1, - ) - ko_result = model_x_knockoff_filter( - test_score, fdr=fdr, ) - fdp, power = cal_fdp_power(ko_result, non_zero) + fdp, power = cal_fdp_power(selected, non_zero) assert fdp <= 0.2 assert power > 0.7 @@ -188,20 +222,17 @@ def test_estimate_distribution(): n = 100 p = 50 X, y, _, non_zero = simu_data(n, p, seed=seed) - test_score = model_x_knockoff( + selected, test_scores, threshold, X_tildes = model_x_knockoff( X, y, cov_estimator=LedoitWolf(assume_centered=True), n_bootstraps=1, 
random_state=seed + 1, - ) - ko_result = model_x_knockoff_filter( - test_score, fdr=fdr, ) - for i in ko_result: + for i in selected: assert np.any(i == non_zero) - test_score = model_x_knockoff( + selected, test_scores, threshold, X_tildes = model_x_knockoff( X, y, cov_estimator=GraphicalLassoCV( @@ -210,12 +241,9 @@ def test_estimate_distribution(): ), n_bootstraps=1, random_state=seed + 2, - ) - ko_result = model_x_knockoff_filter( - test_score, fdr=fdr, ) - for i in ko_result: + for i in selected: assert np.any(i == non_zero) From 70a9a637deb8165ef7c6a967261e7fc0df963e94 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 12 Feb 2025 10:32:54 +0100 Subject: [PATCH 48/48] Formating file --- src/hidimstat/gaussian_knockoff.py | 55 ++++++++++++++++++------------ test/test_knockoff.py | 1 + 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/src/hidimstat/gaussian_knockoff.py b/src/hidimstat/gaussian_knockoff.py index ea17917a..821cf74c 100644 --- a/src/hidimstat/gaussian_knockoff.py +++ b/src/hidimstat/gaussian_knockoff.py @@ -7,9 +7,9 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14, repeat=Fals Generate second-order knockoff variables using the equi-correlated method. This function generates knockoff variables for a given design matrix X, - using the equi-correlated method described in :cite:`barber2015controlling` and - :cite:`candes2018panning`. The function takes as input the design matrix X, - the vector of empirical mean values mu, and the empirical covariance + using the equi-correlated method described in :cite:`barber2015controlling` + and :cite:`candes2018panning`. The function takes as input the design matrix + X, the vector of empirical mean values mu, and the empirical covariance matrix sigma. It returns the knockoff variables X_tilde. The original implementation can be found at @@ -27,14 +27,17 @@ def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14, repeat=Fals The empirical covariance matrix. seed : int, optional - A random seed for generating the uniform noise used to create the knockoff variables. + A random seed for generating the uniform noise used to create + the knockoff variables. tol : float, default=1.e-14 - A tolerance value used for numerical stability in the calculation of the Cholesky decomposition. + A tolerance value used for numerical stability in the calculation + of the Cholesky decomposition. repeat : bool, default=False - If True, the function returns the values used to generate the knockoff variables (mu_tilde and sigma_tilde_decompose), - which can be used to generate additional knockoff variables without having to recompute them. + If True, the function returns the values used to generate the knockoff + variables (mu_tilde and sigma_tilde_decompose), which can be used to + generate additional knockoff variables without having to recompute them. Returns ------- @@ -82,22 +85,26 @@ def repeat_gaussian_knockoff_generation(mu_tilde, sigma_tilde_decompose, seed): """ Generate additional knockoff variables using pre-computed values. - This function generates additional knockoff variables using pre-computed values returned by the - gaussian_knockoff_generation function with repeat=True. It takes as input mu_tilde and - sigma_tilde_decompose, which were returned by gaussian_knockoff_generation, and a random seed. + This function generates additional knockoff variables using pre-computed + values returned by the gaussian_knockoff_generation function + with repeat=True. 
It takes as input mu_tilde and sigma_tilde_decompose,
+    which were returned by gaussian_knockoff_generation, and a random seed.
     It returns the new knockoff variables X_tilde.
 
     Parameters
     ----------
     mu_tilde : 2D ndarray (n_samples, n_features)
-        The matrix of means used to generate the knockoff variables, returned by gaussian_knockoff_generation.
+        The matrix of means used to generate the knockoff variables,
+        returned by gaussian_knockoff_generation.
 
     sigma_tilde_decompose : 2D ndarray (n_features, n_features)
-        The Cholesky decomposition of the covariance matrix used to generate the knockoff variables,
-        returned by gaussian_knockoff_generation.
+        The Cholesky decomposition of the covariance matrix used
+        to generate the knockoff variables, returned by
+        gaussian_knockoff_generation.
 
     seed : int
-        A random seed for generating the uniform noise used to create the knockoff variables.
+        A random seed for generating the uniform noise used to create
+        the knockoff variables.
 
     Returns
     -------
@@ -116,20 +123,26 @@ def _s_equi(sigma, tol=1e-14):
     """
-    Estimate the diagonal matrix of correlation between real and knockoff variables using the equi-correlated equation.
+    Estimate the diagonal matrix of correlation between real
+    and knockoff variables using the equi-correlated equation.
 
-    This function estimates the diagonal matrix of correlation between real and knockoff variables using the
-    equi-correlated equation described in :cite:`barber2015controlling` and
-    :cite:`candes2018panning`. It takes as input the empirical covariance matrix sigma and a tolerance value tol,
-    and returns a vector of diagonal values of the estimated matrix diag{s}.
+    This function estimates the diagonal matrix of correlation
+    between real and knockoff variables using the equi-correlated
+    equation described in :cite:`barber2015controlling` and
+    :cite:`candes2018panning`. It takes as input the empirical
+    covariance matrix sigma and a tolerance value tol,
+    and returns a vector of diagonal values of the estimated
+    matrix diag{s}.
 
     Parameters
     ----------
     sigma : 2D ndarray (n_features, n_features)
-        The empirical covariance matrix calculated from the original design matrix.
+        The empirical covariance matrix calculated from
+        the original design matrix.
 
     tol : float, optional
-        A tolerance value used for numerical stability in the calculation of the eigenvalues of the correlation matrix.
+        A tolerance value used for numerical stability in the calculation
+        of the eigenvalues of the correlation matrix.
 
     Returns
     -------
diff --git a/test/test_knockoff.py b/test/test_knockoff.py
index b93b2ca4..65e8d04c 100644
--- a/test/test_knockoff.py
+++ b/test/test_knockoff.py
@@ -190,6 +190,7 @@ def test_model_x_knockoff_estimator():
 
 
 def test_model_x_knockoff_exception():
+    "Test the exceptions raised by model_x_knockoff"
    n = 50
    p = 100
    seed = 45
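
For reference, a minimal usage sketch of the public API as it stands at the end of this series (a sketch only, assuming the hidimstat package above is importable; simu_data is the simulation helper already used by the tests):

    import numpy as np

    from hidimstat.data_simulation import simu_data
    from hidimstat.knockoffs import (
        model_x_knockoff,
        model_x_knockoff_bootstrap_quantile,
        model_x_knockoff_pvalue,
    )

    # Simulated regression data with a sparse support.
    X, y, _, non_zero_index = simu_data(300, 100, seed=0)

    # Single knockoff draw (n_bootstraps=1): the selection is returned directly.
    selected, test_score, threshold, X_tilde = model_x_knockoff(
        X, y, n_bootstraps=1, random_state=0, fdr=0.2
    )

    # Empirical p-values derived from the same vector of test statistics.
    selected_pval, pvals = model_x_knockoff_pvalue(
        test_score, fdr=0.2, selection_only=False
    )

    # Several knockoff draws, aggregated through quantiles of the p-values.
    selected_multi, test_scores, thresholds, X_tildes = model_x_knockoff(
        X, y, n_bootstraps=25, random_state=0, fdr=0.2
    )
    selected_agg = model_x_knockoff_bootstrap_quantile(test_scores, fdr=0.2)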