Sparse Implementation of build_coleman_forest plus a fix for bottleneck import #317

Merged
2 changes: 2 additions & 0 deletions doc/whats_new/v0.10.rst
@@ -15,6 +15,8 @@ Changelog

- |Feature| Calculations involving nans in ``treeple.stats.utils`` now use the
``bottleneck`` library for faster computation. By `Ryan Hausen`_ (:pr:`#306`)
- |Feature| Added a sparse implementation of ``treeple.stats.forest.build_coleman_forest``
  that uses the ``scipy.sparse`` module. By `Ryan Hausen`_ (:pr:`#317`)


Code and Documentation Contributors
160 changes: 124 additions & 36 deletions treeple/stats/forest.py
@@ -12,7 +12,12 @@
from .._lib.sklearn.ensemble._forest import ForestClassifier
from ..tree._classes import DTYPE
from .permuteforest import PermutationHonestForestClassifier
from .utils import (
    METRIC_FUNCTIONS,
    POSITIVE_METRICS,
    _compute_null_distribution_coleman,
    _compute_null_distribution_coleman_sparse,
)


def _parallel_predict_proba_oob(predict_proba, X, out, idx, test_idx, lock):
@@ -31,6 +36,20 @@
    return prediction


def _parallel_predict_proba_oob_sparse(predict_proba, X, test_idx):
    """
    This is a utility function for joblib's Parallel.
    It can't be defined locally in ForestClassifier or ForestRegressor, because
    joblib complains that it cannot pickle it when placed there. Unlike
    _parallel_predict_proba_oob, this function returns the predictions, along
    with their indices, only for values that are finite (not nan or inf).
    """
    # each tree predicts proba with a list of output (n_samples, n_classes[i])
    prediction = predict_proba(X[test_idx, :], check_input=False)
    good_value_mask = np.isfinite(prediction[:, 0])
    return test_idx[good_value_mask], prediction[good_value_mask]

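A small sketch of the idea behind this helper, not the library's actual internals: the per-tree (indices, finite predictions) pairs can be stacked into one scipy.sparse matrix of shape (n_trees, n_samples) and averaged per sample, a sparse analogue of np.nanmean over axis=0. The per_tree values below are made up.

import numpy as np
from scipy.sparse import coo_matrix

n_trees, n_samples = 3, 5
per_tree = [  # hypothetical (oob indices, positive-class predictions) per tree
    (np.array([0, 1]), np.array([0.8, 0.3])),
    (np.array([1, 4]), np.array([0.5, 0.9])),
    (np.array([0, 4]), np.array([0.7, 0.2])),
]

# stack all trees into a single sparse (n_trees, n_samples) matrix
rows = np.concatenate([np.full(len(idx), t) for t, (idx, _) in enumerate(per_tree)])
cols = np.concatenate([idx for idx, _ in per_tree])
vals = np.concatenate([preds for _, preds in per_tree])
proba = coo_matrix((vals, (rows, cols)), shape=(n_trees, n_samples)).tocsr()

# per-sample mean over only the trees that scored that sample; getnnz counts
# stored entries per column (explicit zeros included, since every finite
# prediction is stored), and samples no tree scored come out as NaN
counts = proba.getnnz(axis=0)
sums = np.asarray(proba.sum(axis=0)).ravel()
mean = np.divide(sums, counts, out=np.full(n_samples, np.nan), where=counts > 0)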

ForestTestResult = namedtuple(
"ForestTestResult",
["observe_test_stat", "permuted_stat", "observe_stat", "pvalue", "null_dist"],
@@ -48,6 +67,7 @@
    verbose=False,
    seed=None,
    return_posteriors=True,
    use_sparse=False,
    **metric_kwargs,
):
    """Build a hypothesis testing forest using a two-forest approach.
@@ -86,6 +106,9 @@
        Random seed, by default None.
    return_posteriors : bool, optional
        Whether or not to return the posteriors, by default True.
    use_sparse : bool, optional
        Whether or not to use a sparse representation for the calculation of
        the permutation statistics, by default False. Doesn't affect return
        values.
    **metric_kwargs : dict, optional
        Additional keyword arguments to pass to the metric function.
@@ -112,36 +135,77 @@
    metric_func: Callable[[ArrayLike, ArrayLike], float] = METRIC_FUNCTIONS[metric]

    # build two sets of forests
    est, orig_forest_proba = build_oob_forest(est, X, y, use_sparse=use_sparse, verbose=verbose)

    if not isinstance(perm_est, PermutationHonestForestClassifier):
        raise RuntimeError(
            f"Permutation forest must be a PermutationHonestForestClassifier, got {type(perm_est)}"
        )
    perm_est, perm_forest_proba = build_oob_forest(
        perm_est,
        X,
        y,
        use_sparse=use_sparse,
        verbose=verbose,
        covariate_index=covariate_index,
    )

    # get the number of jobs
    n_jobs = est.n_jobs

    if y.ndim == 1:
        y = y.reshape(-1, 1)

    if use_sparse:
        y_pred_proba_orig_perm, observe_stat, permute_stat, metric_star, metric_star_pi = (
            _compute_null_distribution_coleman_sparse(
                y,
                orig_forest_proba,
                perm_forest_proba,
                metric,
                n_repeats=n_repeats,
                seed=seed,
                n_jobs=n_jobs,
                **metric_kwargs,
            )
        )

        # if we are returning the posteriors, then we need to replace the
        # sparse indices and values with an array. We convert the sparse data
        # to dense data so that the function returns results in a consistent format.
        if return_posteriors:
            n_trees = y_pred_proba_orig_perm.shape[0] // 2
            n_samples = y_pred_proba_orig_perm.shape[1]

            to_coords_data = lambda x: (x.row.astype(int), x.col.astype(int), x.data)

            row, col, data = to_coords_data(y_pred_proba_orig_perm[:n_trees, :].tocoo())
            orig_forest_proba = np.full((n_trees, n_samples), np.nan, dtype=np.float64)
            orig_forest_proba[row, col] = data

            row, col, data = to_coords_data(y_pred_proba_orig_perm[n_trees:, :].tocoo())
            perm_forest_proba = np.full((n_trees, n_samples), np.nan, dtype=np.float64)
            perm_forest_proba[row, col] = data

            if y.shape[1] == 2:
                orig_forest_proba = np.column_stack((orig_forest_proba, 1 - orig_forest_proba))
                perm_forest_proba = np.column_stack((perm_forest_proba, 1 - perm_forest_proba))
    else:
        metric_star, metric_star_pi = _compute_null_distribution_coleman(
            y,
            orig_forest_proba,
            perm_forest_proba,
            metric,
            n_repeats=n_repeats,
            seed=seed,
            n_jobs=n_jobs,
            **metric_kwargs,
        )

        y_pred_proba_orig = np.nanmean(orig_forest_proba, axis=0)
        y_pred_proba_perm = np.nanmean(perm_forest_proba, axis=0)
        observe_stat = metric_func(y, y_pred_proba_orig, **metric_kwargs)
        permute_stat = metric_func(y, y_pred_proba_perm, **metric_kwargs)

    # metric^\pi - metric = observed test statistic, which under the
    # null is normally distributed around 0
@@ -165,7 +229,9 @@
    return forest_result
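
The code elided by the hunk above turns these quantities into a test statistic and p-value. Below is a hedged sketch of that step consistent with the comment above, with stand-in numbers rather than the library's exact lines:

import numpy as np

rng = np.random.default_rng(0)
n_repeats = 1000
metric_star = rng.normal(0.5, 0.01, n_repeats)     # stand-in null metrics
metric_star_pi = rng.normal(0.5, 0.01, n_repeats)  # stand-in permuted null metrics
observe_stat, permute_stat = 0.75, 0.52            # stand-in observed metrics

observe_test_stat = permute_stat - observe_stat  # centered at 0 under the null
null_dist = metric_star_pi - metric_star         # also centered at 0

# for a metric where larger is better, a strongly negative difference is
# evidence against the null
pvalue = (1 + (null_dist <= observe_test_stat).sum()) / (1 + n_repeats)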

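For the return_posteriors branch above, here is the sparse-to-dense conversion in miniature (toy values, not from the PR): entries that no tree scored stay NaN, so downstream nanmean-style reductions behave exactly as in the dense path.

import numpy as np
from scipy.sparse import coo_matrix

n_trees, n_samples = 2, 4
sparse_proba = coo_matrix(
    (np.array([0.9, 0.1, 0.6]), (np.array([0, 0, 1]), np.array([0, 2, 3]))),
    shape=(n_trees, n_samples),
)

coo = sparse_proba.tocoo()
dense = np.full((n_trees, n_samples), np.nan, dtype=np.float64)
dense[coo.row.astype(int), coo.col.astype(int)] = coo.data
# dense == [[0.9, nan, 0.1, nan],
#           [nan, nan, nan, 0.6]]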

def build_oob_forest(
    est: ForestClassifier, X, y, use_sparse: bool = False, verbose=False, **est_kwargs
):
    """Build a hypothesis testing forest using oob samples.

    Parameters
@@ -179,6 +245,8 @@
        Data.
    y : ArrayLike of shape (n_samples, n_outputs)
        Binary target, so ``n_outputs`` should be at most 1.
    use_sparse : bool, optional
        Whether or not to use a sparse representation for the posteriors.
    verbose : bool, optional
        Verbosity, by default False.
    **est_kwargs : dict, optional
@@ -213,26 +281,46 @@
    # Assign chunk of trees to jobs
    n_jobs, _, _ = _partition_estimators(est.n_estimators, est.n_jobs)

    if use_sparse:
        if hasattr(est, "oob_samples_"):
            oob_samples_list = est.oob_samples_
        else:
            inbag_samples = est.estimators_samples_
            all_samples = np.arange(X.shape[0])
            oob_samples_list = [
                np.setdiff1d(all_samples, inbag_samples[i]) for i in range(len(inbag_samples))
            ]

        oob_preds_test_idxs = Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
            delayed(_parallel_predict_proba_oob_sparse)(e.predict_proba, X, test_idx)
            for e, test_idx in zip(est.estimators_, oob_samples_list)
        )
        all_proba = list(zip(*oob_preds_test_idxs))
    else:
        # avoid storing the output of every estimator by summing them here
        lock = threading.Lock()
        # accumulate the predictions across all trees
        all_proba = np.full(
            (len(est.estimators_), X.shape[0], est.n_classes_), np.nan, dtype=np.float64
        )
        if hasattr(est, "oob_samples_"):
            Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
                delayed(_parallel_predict_proba_oob)(
                    e.predict_proba, X, all_proba, idx, test_idx, lock
                )
                for idx, (e, test_idx) in enumerate(zip(est.estimators_, est.oob_samples_))
            )
        else:
            inbag_samples = est.estimators_samples_
            all_samples = np.arange(X.shape[0])
            oob_samples_list = [
                np.setdiff1d(all_samples, inbag_samples[i]) for i in range(len(inbag_samples))
            ]
            Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
                delayed(_parallel_predict_proba_oob)(
                    e.predict_proba, X, all_proba, idx, test_idx, lock
                )
                for idx, (e, test_idx) in enumerate(zip(est.estimators_, oob_samples_list))
            )

    return est, all_proba
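
To make the use_sparse return shape concrete, here is list(zip(*oob_preds_test_idxs)) in miniature, with made-up values: instead of a dense (n_trees, n_samples, n_classes) array, all_proba becomes a pair of tuples, the per-tree OOB indices and the per-tree finite predictions.

import numpy as np

oob_preds_test_idxs = [
    (np.array([0, 2]), np.array([0.9, 0.1])),  # tree 0: (oob indices, predictions)
    (np.array([1, 2]), np.array([0.4, 0.6])),  # tree 1
]
all_proba = list(zip(*oob_preds_test_idxs))
# all_proba[0] == (array([0, 2]), array([1, 2]))          -> per-tree indices
# all_proba[1] == (array([0.9, 0.1]), array([0.4, 0.6]))  -> per-tree predictions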
27 changes: 17 additions & 10 deletions treeple/stats/tests/test_forest.py
@@ -1,4 +1,5 @@
import importlib
import itertools
import os

import numpy as np
@@ -235,20 +236,24 @@ def test_comight_repeated_feature_sets(seed):
    assert result.pvalue > 0.05, f"{result.pvalue}"


@pytest.mark.parametrize(
    ("use_bottleneck", "use_sparse"),
    itertools.product([True, False], [True, False]),
)
def test_build_coleman_forest(use_bottleneck: bool, use_sparse: bool):
    """Simple test for building a Coleman forest.

    Test the function under alternative and null hypothesis for a very simple dataset.
    """
    if not use_bottleneck and utils.DISABLE_BN_ENV_VAR not in os.environ:
        os.environ[utils.DISABLE_BN_ENV_VAR] = "1"
    elif use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ:
        del os.environ[utils.DISABLE_BN_ENV_VAR]

    # We need to reload the modules after changing the environment variable
    # because an environment variable is used to disable bottleneck
    importlib.reload(utils)
    importlib.reload(stats)

    n_estimators = 100
    n_samples = 30
@@ -290,7 +295,9 @@ def test_build_coleman_forest(use_bottleneck: bool):
        perm_forest_proba,
        clf_fitted,
        perm_clf_fitted,
    ) = stats.build_coleman_forest(
        clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed, use_sparse=use_sparse
    )
    assert clf_fitted._n_samples_bootstrap == round(n_samples * 1.6)
    assert perm_clf_fitted._n_samples_bootstrap == round(n_samples * 1.6)
    assert_array_equal(perm_clf_fitted.permutation_indices_.shape, (n_samples, 1))
21 changes: 21 additions & 0 deletions treeple/stats/tests/test_utils.py
@@ -67,3 +67,24 @@ def test_non_nan_samples(use_bottleneck: bool):
    expected = np.array([0, 2])
    actual = utils._non_nan_samples(posterior_array)
    np.testing.assert_array_equal(expected, actual)


@pytest.mark.parametrize("use_bottleneck", [True, False])
def test_nanmean_f(use_bottleneck: bool):
    if use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ:
        del os.environ[utils.DISABLE_BN_ENV_VAR]
        importlib.reload(utils)
    else:
        os.environ[utils.DISABLE_BN_ENV_VAR] = "1"
        importlib.reload(utils)

    posterior_array = np.array(
        [
            [1, 2, np.nan],
            [3, 4, np.nan],
        ]
    )

    expected = np.array([1.5, 3.5])
    actual = utils.nanmean_f(posterior_array, axis=1)
    np.testing.assert_array_equal(expected, actual)