-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #122 from lionelkusch/PR_estimation_threshold
Estimation threshold(1/4): add comments and docstring of the functions
- Loading branch information
Showing
8 changed files
with
172 additions
and
101 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import numpy as np | ||
from numpy.linalg import norm | ||
from sklearn.model_selection import GridSearchCV | ||
from sklearn.svm import LinearSVR | ||
|
||
|
||
def empirical_thresholding(
    X,
    y,
    linear_estimator=None,
):
    """
    Estimate coefficients and an empirical threshold with a linear estimator.

    The estimator is fitted on ``(X, y)``; its coefficients are returned
    together with a single scale value, repeated once per coefficient, that
    can be used to keep only extreme coefficients.
    For more details, see the section 6.3.2 of :cite:`chevalier_statistical_2020`

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        The input data.
    y : ndarray, shape (n_samples,)
        The target values.
    linear_estimator : estimator object or None, optional (default=None)
        The linear estimator to use for thresholding. It should implement
        the `fit` method and expose, once fitted, either a `coef_` attribute
        or a `best_estimator_` attribute with a `coef_` attribute
        (e.g., a `GridSearchCV` object). The estimator passed in is fitted
        in place. When None, a new
        ``GridSearchCV(LinearSVR(), param_grid={"C": np.logspace(-7, 1, 9)},
        n_jobs=None)`` is created for this call.

    Returns
    -------
    beta_hat : ndarray, shape (n_features,)
        The estimated coefficients of the linear estimator.
    scale : ndarray, shape (n_features,)
        The threshold value, identical for every feature.

    Raises
    ------
    ValueError
        If the fitted `linear_estimator` has neither a `coef_` attribute
        nor a `best_estimator_` attribute with a `coef_` attribute.

    Notes
    -----
    The threshold is the norm of the estimated coefficients divided by the
    square root of the number of features, i.e. their root mean square.
    Under the assumption that the coefficients follow a distribution with
    mean zero, this is an estimate of their standard deviation.
    """
    # Build the default estimator per call: a default instance created in the
    # signature would be shared (and refitted) across every call.
    if linear_estimator is None:
        linear_estimator = GridSearchCV(
            LinearSVR(), param_grid={"C": np.logspace(-7, 1, 9)}, n_jobs=None
        )

    _, n_features = X.shape

    linear_estimator.fit(X, y)

    if hasattr(linear_estimator, "coef_"):
        beta_hat = linear_estimator.coef_
    elif hasattr(linear_estimator, "best_estimator_") and hasattr(
        linear_estimator.best_estimator_, "coef_"
    ):
        # Cross-validation wrappers expose the refitted model separately.
        beta_hat = linear_estimator.best_estimator_.coef_
    else:
        raise ValueError("linear estimator should be linear.")

    # Root mean square of the coefficients, broadcast to one value per entry.
    std = norm(beta_hat) / np.sqrt(n_features)
    scale = std * np.ones(beta_hat.size)

    return beta_hat, scale
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
""" | ||
Test the empirical thresholding module | ||
""" | ||
|
||
import pytest | ||
import numpy as np | ||
from numpy.testing import assert_almost_equal | ||
|
||
from hidimstat.scenario import multivariate_1D_simulation | ||
from hidimstat.empirical_thresholding import empirical_thresholding | ||
from hidimstat.stat_tools import pval_from_scale | ||
|
||
from sklearn.linear_model import Lasso | ||
from sklearn.tree import DecisionTreeRegressor | ||
|
||
|
||
def test_emperical_thresholding():
    """Check empirical thresholding with its default estimator.

    The data come from a simulation with no structure and a support of
    size 1. Computing one-sided p-values, a low p-value is expected for the
    first feature and p-values close to 0.5 for the others.
    """
    n_features = 50
    support_size = 1

    # Only the design matrix and the target are needed from the simulation.
    X_init, y, _, _ = multivariate_1D_simulation(
        n_samples=20,
        n_features=n_features,
        support_size=support_size,
        sigma=0.1,
        rho=0.0,
        shuffle=False,
        seed=3,
    )

    coefficients, thresholds = empirical_thresholding(X_init, y)
    _, corrected_pvalues, _, _ = pval_from_scale(coefficients, thresholds)

    # 0.5 everywhere except on the support, where 0.0 is expected.
    target = np.full(n_features, 0.5)
    target[:support_size] = 0.0

    assert_almost_equal(corrected_pvalues, target, decimal=1)
|
||
|
||
def test_emperical_thresholding_lasso():
    """Check empirical thresholding with user-supplied estimators.

    The data come from a simulation with no structure and a support of
    size 1. An estimator without coefficients (a decision tree) must be
    rejected with a ValueError, while a Lasso must give a p-value close to 0
    for the first feature and close to 0.5 for the others.
    """
    n_features = 50
    support_size = 1

    # Only the design matrix and the target are needed from the simulation.
    X_init, y, _, _ = multivariate_1D_simulation(
        n_samples=20,
        n_features=n_features,
        support_size=support_size,
        sigma=0.1,
        rho=0.0,
        shuffle=False,
        seed=3,
    )

    # A tree regressor exposes no `coef_`, so the call has to fail.
    with pytest.raises(ValueError, match="linear estimator should be linear."):
        empirical_thresholding(X_init, y, linear_estimator=DecisionTreeRegressor())

    coefficients, thresholds = empirical_thresholding(
        X_init, y, linear_estimator=Lasso()
    )
    _, corrected_pvalues, _, _ = pval_from_scale(coefficients, thresholds)

    # 0.5 everywhere except on the support, where 0.0 is expected.
    target = np.full(n_features, 0.5)
    target[:support_size] = 0.0

    assert_almost_equal(corrected_pvalues, target, decimal=1)
This file was deleted.
Oops, something went wrong.