Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve the coverage of the tests and fix some minor bugs #64

Merged
merged 28 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
6507587
add test for cluster and a method for getting groups
lionelkusch Dec 16, 2024
2233a5f
Add test for dcrt
lionelkusch Dec 16, 2024
cb02e0c
Add commit for desparsified functions and fixed one bug
lionelkusch Dec 16, 2024
d36dc68
Add tests for ensemble clustered
lionelkusch Dec 16, 2024
65282b5
Add test for the knockoff functions.
lionelkusch Dec 16, 2024
968500c
Add test for Reid methods
lionelkusch Dec 16, 2024
05628cf
Add test for permutation_test
lionelkusch Dec 16, 2024
744f201
Add tests and fix name of one variable
lionelkusch Dec 16, 2024
f3473c7
Add test and change name of variables
lionelkusch Dec 16, 2024
9b872a8
Unnecessary file
lionelkusch Dec 16, 2024
4683e58
Fix typo
lionelkusch Dec 18, 2024
a830f04
Fix syntax
lionelkusch Dec 18, 2024
63e6c54
Fix bugs by changing name of groups
lionelkusch Dec 18, 2024
3dfc30e
Modified scenario with group
lionelkusch Dec 19, 2024
369ccdb
Fix typo
lionelkusch Dec 19, 2024
7e03524
don't use global variable
lionelkusch Jan 6, 2025
c0f2b5a
Merge branch 'main' into PR_test_increase
lionelkusch Jan 6, 2025
794310e
Apply suggestions from code review
lionelkusch Jan 7, 2025
9c69a8f
Change pvalue to evalue
lionelkusch Jan 7, 2025
12edb23
Fix bugs and typo
lionelkusch Jan 7, 2025
e06122f
Apply suggestions from code review
lionelkusch Jan 17, 2025
cee3f9a
Update test_scenario.py
lionelkusch Jan 17, 2025
0937e9c
Reverse the option group
lionelkusch Jan 17, 2025
d5b0e28
Merge branch 'main' into PR_test_increase
lionelkusch Jan 17, 2025
852106f
Remove unnecessary import
lionelkusch Jan 17, 2025
943fe11
Format file
lionelkusch Jan 17, 2025
85b44a3
small difference
lionelkusch Jan 20, 2025
85a6026
remove file setup
lionelkusch Jan 21, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/hidimstat/desparsified_lasso.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def _compute_residuals(

else:

ValueError("The only regression method available is 'lasso'")
raise ValueError("The only regression method available is 'lasso'")

clf.fit(X_new, y)
z = y - clf.predict(X_new)
Expand Down
4 changes: 3 additions & 1 deletion src/hidimstat/scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,19 +59,21 @@ def multivariate_1D_simulation(

rng = np.random.default_rng(seed)

# generate random data for each samples
X = np.zeros((n_samples, n_features))
X[:, 0] = rng.standard_normal(n_samples)

for i in np.arange(1, n_features):
rand_vector = ((1 - rho**2) ** 0.5) * rng.standard_normal(n_samples)
X[:, i] = rho * X[:, i - 1] + rand_vector

if shuffle:
rng.shuffle(X.T)

# generate the vector of variable of importances
beta = np.zeros(n_features)
beta[0:support_size] = 1.0

# generate the simulated regression data
noise = sigma * rng.standard_normal(n_samples)
y = np.dot(X, beta) + noise

Expand Down
14 changes: 0 additions & 14 deletions src/hidimstat/setup.py

This file was deleted.

2 changes: 1 addition & 1 deletion src/hidimstat/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def _fixed_quantile_aggregation(pvals, gamma=0.5):
def _adaptive_quantile_aggregation(pvals, gamma_min=0.05):
"""adaptive version of the quantile aggregation method, Meinshausen et al.
(2008)"""
gammas = np.arange(gamma_min, 1.05, 0.05)
gammas = np.linspace(gamma_min, 1.0, 30)
list_Q = np.array([_fixed_quantile_aggregation(pvals, gamma) for gamma in gammas])

return np.minimum(1, (1 - np.log(gamma_min)) * list_Q.min(0))
Expand Down
122 changes: 112 additions & 10 deletions test/test_clustered_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Test the clustered_inference module
"""

import pytest
import numpy as np
from numpy.testing import assert_almost_equal
from sklearn.cluster import FeatureAgglomeration
Expand All @@ -14,16 +15,17 @@
)


def test_clustered_inference():
"""Testing the procedure on two simulations with a 1D data structure and
with n << p: the first test has no temporal dimension, the second has a
temporal dimension. The support is connected and of size 10, it must be
recovered with a small spatial tolerance parametrized by `margin_size`.
# Scenario 1: data with no temporal dimension
def test_clustered_inference_no_temporal():
"""
Testing the procedure on one simulations with a 1D data structure and
with n << p: no temporal dimension. The support is connected and of
size 10, it must be recovered with a small spatial tolerance
parametrized by `margin_size`.
Computing one sided p-values, we want low p-values for the features of
the support and p-values close to 0.5 for the others."""
the support and p-values close to 0.5 for the others.
"""

# Scenario 1: data with no temporal dimension
# ###########################################
n_samples, n_features = 100, 2000
support_size = 10
sigma = 5.0
Expand Down Expand Up @@ -63,8 +65,17 @@ def test_clustered_inference():
pval_corr[extended_support:200], expected[extended_support:200], decimal=1
)

# Scenario 2: temporal data
# #########################

# Scenario 2: temporal data
def test_clustered_inference_temporal():
"""
Testing the procedure on two simulations with a 1D data structure and
with n << p: with a temporal dimension. The support is connected and
of size 10, it must be recovered with a small spatial tolerance
parametrized by `margin_size`.
Computing one sided p-values, we want low p-values for the features of
the support and p-values close to 0.5 for the others.
"""
n_samples, n_features, n_times = 200, 2000, 10
support_size = 10
sigma = 5.0
Expand Down Expand Up @@ -104,3 +115,94 @@ def test_clustered_inference():
assert_almost_equal(
pval_corr[extended_support:], expected[extended_support:], decimal=1
)


# Scenario 3: data with no temporal dimension and with groups
def test_clustered_inference_no_temporal_groups():
    """
    Test the procedure on one simulation with a 1D data structure,
    no temporal dimension, and n << p. The support is connected and of
    size 10; it must be recovered with a small spatial tolerance
    parametrized by `margin_size`.
    The samples are split into 10 groups of equal size.
    Computing one-sided p-values, we expect low p-values for the features
    of the support and p-values close to 0.5 for the others.
    """

    n_samples, n_features = 20, 2000
    support_size = 10
    n_groups = 10
    sigma = 5.0
    rho = 0.95
    n_clusters = 200
    margin_size = 5
    interior_support = support_size - margin_size
    extended_support = support_size + margin_size

    # Simulate one independent dataset per group (distinct seeds).
    simulations = [
        multivariate_1D_simulation(
            n_samples=n_samples,
            n_features=n_features,
            support_size=support_size,
            sigma=sigma,
            rho=rho,
            shuffle=False,
            seed=2 + group_idx,
        )
        for group_idx in range(n_groups)
    ]

    # Stack the per-group data, then center targets and features.
    y_all = np.concatenate([sim[1] for sim in simulations])
    y_all = y_all - np.mean(y_all)
    X_all = np.concatenate([sim[0] for sim in simulations])
    X_all = X_all - np.mean(X_all, axis=0)
    # Group label of each sample: n_samples consecutive rows per group.
    groups = np.repeat(np.arange(0, n_groups), n_samples)

    # 1D chain connectivity for Ward feature agglomeration.
    connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1)
    ward = FeatureAgglomeration(
        n_clusters=n_clusters, connectivity=connectivity, linkage="ward"
    )

    beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = (
        clustered_inference(X_all, y_all, ward, n_clusters, groups=groups)
    )

    # Null features should sit near 0.5; support features near 0.
    expected = 0.5 * np.ones(n_features)
    expected[:support_size] = 0.0

    assert_almost_equal(pval_corr[:interior_support], expected[:interior_support])
    assert_almost_equal(
        pval_corr[extended_support:200], expected[extended_support:200], decimal=1
    )


# Scenario 4: check the exception raised for an unknown inference method
def test_clustered_inference_exception_methods():
    """
    Testing the procedure on one simulation with a 1D data structure and
    checking that it raises a ValueError when an unknown inference method
    is provided.
    """
    n_samples, n_features = 100, 2000
    n_clusters = 200

    # Simulated regression data; only X and y are used here.
    X_init, y, beta, epsilon = multivariate_1D_simulation(
        n_samples=n_samples,
        n_features=n_features,
        shuffle=False,
        seed=2,
    )

    # Center target and features, as expected by the inference routine.
    y = y - np.mean(y)
    X_init = X_init - np.mean(X_init, axis=0)

    # 1D chain connectivity for Ward feature agglomeration.
    connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1)
    ward = FeatureAgglomeration(
        n_clusters=n_clusters, connectivity=connectivity, linkage="ward"
    )

    # "lll" is not a supported method; the error message spelling
    # ("Unknow method") matches the project source as-is.
    with pytest.raises(ValueError, match="Unknow method"):
        clustered_inference(X_init, y, ward, n_clusters, method="lll")
31 changes: 28 additions & 3 deletions test/test_dcrt.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def test_dcrt_lasso():
"""
X, y = make_regression(n_samples=100, n_features=10, noise=0.2, random_state=2024)
# Checking if a loss != 'least_square'
with pytest.raises(Exception):
with pytest.raises(ValueError, match="test loss is not supported."):
_ = dcrt_zero(
X,
y,
Expand All @@ -27,7 +27,7 @@ def test_dcrt_lasso():
)

# Checking for a different statistic
with pytest.raises(Exception):
with pytest.raises(ValueError, match="test statistic is not supported."):
_ = dcrt_zero(
X,
y,
Expand All @@ -37,6 +37,30 @@ def test_dcrt_lasso():
random_state=2024,
)

# Checking for bad selection of screening_threshold
result_th_screen_bad = dcrt_zero(
X,
y,
screening_threshold=0,
screening=True,
verbose=False,
)
assert result_th_screen_bad.size == 0

# Checking for bad selection of screening_threshold with verbose
result_th_screen_bad = dcrt_zero(
X,
y,
screening_threshold=0,
screening=True,
verbose=True,
)

assert len(result_th_screen_bad) == 3
assert result_th_screen_bad[0].size == 0
assert np.all(result_th_screen_bad[1] == np.ones(10))
assert np.all(result_th_screen_bad[2] == np.zeros(10))

# Checking with and without screening
results_no_screening = dcrt_zero(
X, y, screening=False, verbose=True, statistic="residual", random_state=2024
Expand Down Expand Up @@ -70,7 +94,8 @@ def test_dcrt_lasso():
X,
y,
refit=True,
screening=False,
screening=True,
screening_threshold=50,
verbose=True,
statistic="residual",
random_state=2024,
Expand Down
11 changes: 11 additions & 0 deletions test/test_desparsified_lasso.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np
from numpy.testing import assert_almost_equal, assert_equal
from scipy.linalg import toeplitz
import pytest

from hidimstat.desparsified_lasso import desparsified_group_lasso, desparsified_lasso
from hidimstat.scenario import (
Expand Down Expand Up @@ -48,6 +49,16 @@ def test_desparsified_lasso():
assert_equal(cb_max > beta, True)


def test_desparsified_lasso_exception():
    """Check that requesting a residual method other than lasso raises."""

    # Default simulation settings are sufficient; only X and y are used.
    X, y, beta, noise = multivariate_1D_simulation()
    expected_message = "The only regression method available is 'lasso'"
    with pytest.raises(ValueError, match=expected_message):
        desparsified_lasso(X, y, residual_method="test")


def test_desparsified_group_lasso():
"""Testing the procedure on a simulation with no structure and
a support of size 2. Computing one-sided p-values, we want
Expand Down
52 changes: 52 additions & 0 deletions test/test_ensemble_clustered_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

import numpy as np
import pytest
from numpy.testing import assert_almost_equal
from sklearn.cluster import FeatureAgglomeration
from sklearn.feature_extraction import image
Expand Down Expand Up @@ -125,3 +126,54 @@ def test_ensemble_clustered_inference():
assert_almost_equal(
pval_corr[extended_support:], expected[extended_support:], decimal=1
)

beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = (
ensemble_clustered_inference(
X,
Y,
ward,
n_clusters,
n_bootstraps=n_bootstraps,
inference_method=inference_method,
ensembling_method="medians",
)
)

expected = 0.5 * np.ones(n_features)
expected[:support_size] = 0.0

assert_almost_equal(
pval_corr[:interior_support], expected[:interior_support], decimal=3
)
assert_almost_equal(
pval_corr[extended_support:], expected[extended_support:], decimal=1
)


def test_ensemble_clustered_inference_exception():
    """
    Check that ensemble_clustered_inference raises ValueError for an
    unknown ensembling method and for an invalid `memory` argument.
    """
    n_samples, n_features = 100, 2000
    n_clusters = 10

    # Simulated regression data; only X and Y are used below.
    X, Y, beta, epsilon = multivariate_1D_simulation(
        n_samples=n_samples,
        n_features=n_features,
    )

    # 1D chain connectivity for Ward feature agglomeration.
    connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1)
    ward = FeatureAgglomeration(
        n_clusters=n_clusters, connectivity=connectivity, linkage="ward"
    )

    # An unsupported ensembling method must be rejected.
    with pytest.raises(ValueError, match="Unknown ensembling method."):
        ensemble_clustered_inference(
            X, Y, ward, n_clusters, ensembling_method="wrong_method"
        )

    # `memory` must be None or a path string; a list is invalid.
    memory_error = (
        "'memory' must be None or a string corresponding "
        "to the path of the caching directory."
    )
    with pytest.raises(ValueError, match=memory_error):
        ensemble_clustered_inference(X, Y, ward, n_clusters, memory=[])
Loading
Loading