Skip to content

Commit

Permalink
Sparse Implementation of build_coleman_forest plus a fix for bottle…
Browse files Browse the repository at this point in the history
…neck import (#317)

* sparse implementation of build_coleman_forest. fixed call bottleneck check
* added a unit test for utils._get_forest_preds_sparse"
  • Loading branch information
ryanhausen authored Oct 11, 2024
1 parent fe6661d commit e1c38ad
Show file tree
Hide file tree
Showing 5 changed files with 438 additions and 55 deletions.
2 changes: 2 additions & 0 deletions doc/whats_new/v0.10.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ Changelog

- |Feature| Calculations involving nans in ``treeple.stats.utils`` now use the
``bottleneck`` library for faster computation. By `Ryan Hausen`_ (:pr:`#306`)
- |Feature| Added a sparse implementation of `treeple.stats.forest.build_colemen_forest`
that uses the `scipy.sparse` module. By `Ryan Hausen`_ (:pr:`#317`)


Code and Documentation Contributors
Expand Down
160 changes: 124 additions & 36 deletions treeple/stats/forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,12 @@
from ..tree import MultiViewDecisionTreeClassifier
from ..tree._classes import DTYPE
from .permuteforest import PermutationHonestForestClassifier
from .utils import METRIC_FUNCTIONS, POSITIVE_METRICS, _compute_null_distribution_coleman
from .utils import (
METRIC_FUNCTIONS,
POSITIVE_METRICS,
_compute_null_distribution_coleman,
_compute_null_distribution_coleman_sparse,
)


def _parallel_predict_proba_oob(predict_proba, X, out, idx, test_idx, lock):
Expand All @@ -34,6 +39,20 @@ def _parallel_predict_proba_oob(predict_proba, X, out, idx, test_idx, lock):
return prediction


def _parallel_predict_proba_oob_sparse(predict_proba, X, test_idx):
"""
This is a utility function for joblib's Parallel.
It can't go locally in ForestClassifier or ForestRegressor, because joblib
complains that it cannot pickle it when placed there. Different from
_parallel_predict_proba_oob, this function returns the predictions and
indices for those values that are finite (not nan or inf).
"""
# each tree predicts proba with a list of output (n_samples, n_classes[i])
prediction = predict_proba(X[test_idx, :], check_input=False)
good_value_mask = np.isfinite(prediction[:, 0])
return test_idx[good_value_mask], prediction[good_value_mask]


ForestTestResult = namedtuple(
"ForestTestResult",
["observe_test_stat", "permuted_stat", "observe_stat", "pvalue", "null_dist"],
Expand All @@ -51,6 +70,7 @@ def build_coleman_forest(
verbose=False,
seed=None,
return_posteriors=True,
use_sparse=False,
**metric_kwargs,
):
"""Build a hypothesis testing forest using a two-forest approach.
Expand Down Expand Up @@ -89,6 +109,9 @@ def build_coleman_forest(
Random seed, by default None.
return_posteriors : bool, optional
Whether or not to return the posteriors, by default True.
use_sparse : bool, optional
Whether or not to use a sparse for the calculation of the permutation
statistics, by default False. Doesn't affect return values.
**metric_kwargs : dict, optional
Additional keyword arguments to pass to the metric function.
Expand Down Expand Up @@ -118,7 +141,7 @@ def build_coleman_forest(
raise RuntimeError(f"Original forest must be a HonestForestClassifier, got {type(est)}")

# build two sets of forests
est, orig_forest_proba = build_oob_forest(est, X, y, verbose=verbose)
est, orig_forest_proba = build_oob_forest(est, X, y, use_sparse=use_sparse, verbose=verbose)

if not isinstance(perm_est, PermutationHonestForestClassifier):
raise RuntimeError(
Expand All @@ -133,29 +156,70 @@ def build_coleman_forest(
)

perm_est, perm_forest_proba = build_oob_forest(
perm_est, X, y, verbose=verbose, covariate_index=covariate_index
perm_est,
X,
y,
use_sparse=use_sparse,
verbose=verbose,
covariate_index=covariate_index,
)

# get the number of jobs
n_jobs = est.n_jobs

if y.ndim == 1:
y = y.reshape(-1, 1)
metric_star, metric_star_pi = _compute_null_distribution_coleman(
y,
orig_forest_proba,
perm_forest_proba,
metric,
n_repeats=n_repeats,
seed=seed,
n_jobs=n_jobs,
**metric_kwargs,
)

y_pred_proba_orig = np.nanmean(orig_forest_proba, axis=0)
y_pred_proba_perm = np.nanmean(perm_forest_proba, axis=0)
observe_stat = metric_func(y, y_pred_proba_orig, **metric_kwargs)
permute_stat = metric_func(y, y_pred_proba_perm, **metric_kwargs)
if use_sparse:
y_pred_proba_orig_perm, observe_stat, permute_stat, metric_star, metric_star_pi = (
_compute_null_distribution_coleman_sparse(
y,
orig_forest_proba,
perm_forest_proba,
metric,
n_repeats=n_repeats,
seed=seed,
n_jobs=n_jobs,
**metric_kwargs,
)
)

# if we are returning the posteriors, then we need to replace the
# sparse indices and values with an array. We convert the sparse data
# to dense data, so that the function returns results in a consistent format.
if return_posteriors:
n_trees = y_pred_proba_orig_perm.shape[0] // 2
n_samples = y_pred_proba_orig_perm.shape[1]

to_coords_data = lambda x: (x.row.astype(int), x.col.astype(int), x.data)

row, col, data = to_coords_data(y_pred_proba_orig_perm[:n_trees, :].tocoo())
orig_forest_proba = np.full((n_trees, n_samples), np.nan, dtype=np.float64)
orig_forest_proba[row, col] = data

row, col, data = to_coords_data(y_pred_proba_orig_perm[n_trees:, :].tocoo())
perm_forest_proba = np.full((n_trees, n_samples), np.nan, dtype=np.float64)
perm_forest_proba[row, col] = data

if y.shape[1] == 2:
orig_forest_proba = np.column_stack((orig_forest_proba, 1 - orig_forest_proba))
perm_forest_proba = np.column_stack((perm_forest_proba, 1 - perm_forest_proba))
else:
metric_star, metric_star_pi = _compute_null_distribution_coleman(
y,
orig_forest_proba,
perm_forest_proba,
metric,
n_repeats=n_repeats,
seed=seed,
n_jobs=n_jobs,
**metric_kwargs,
)

y_pred_proba_orig = np.nanmean(orig_forest_proba, axis=0)
y_pred_proba_perm = np.nanmean(perm_forest_proba, axis=0)
observe_stat = metric_func(y, y_pred_proba_orig, **metric_kwargs)
permute_stat = metric_func(y, y_pred_proba_perm, **metric_kwargs)

# metric^\pi - metric = observed test statistic, which under the
# null is normally distributed around 0
Expand All @@ -179,7 +243,9 @@ def build_coleman_forest(
return forest_result


def build_oob_forest(est: ForestClassifier, X, y, verbose=False, **est_kwargs):
def build_oob_forest(
est: ForestClassifier, X, y, use_sparse: bool = False, verbose=False, **est_kwargs
):
"""Build a hypothesis testing forest using oob samples.
Parameters
Expand All @@ -193,6 +259,8 @@ def build_oob_forest(est: ForestClassifier, X, y, verbose=False, **est_kwargs):
Data.
y : ArrayLike of shape (n_samples, n_outputs)
Binary target, so ``n_outputs`` should be at most 1.
use_sparse : bool, optional
Whether or not to use a sparse representation for the posteriors.
verbose : bool, optional
Verbosity, by default False.
**est_kwargs : dict, optional
Expand Down Expand Up @@ -227,26 +295,46 @@ def build_oob_forest(est: ForestClassifier, X, y, verbose=False, **est_kwargs):
# Assign chunk of trees to jobs
n_jobs, _, _ = _partition_estimators(est.n_estimators, est.n_jobs)

# avoid storing the output of every estimator by summing them here
lock = threading.Lock()
# accumulate the predictions across all trees
all_proba = np.full(
(len(est.estimators_), X.shape[0], est.n_classes_), np.nan, dtype=np.float64
)
if hasattr(est, "oob_samples_"):
Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
delayed(_parallel_predict_proba_oob)(e.predict_proba, X, all_proba, idx, test_idx, lock)
for idx, (e, test_idx) in enumerate(zip(est.estimators_, est.oob_samples_))
if use_sparse:
if hasattr(est, "oob_samples_"):
oob_samples_list = est.oob_samples_
else:
inbag_samples = est.estimators_samples_
all_samples = np.arange(X.shape[0])
oob_samples_list = [
np.setdiff1d(all_samples, inbag_samples[i]) for i in range(len(inbag_samples))
]

oob_preds_test_idxs = Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
delayed(_parallel_predict_proba_oob_sparse)(e.predict_proba, X, test_idx)
for e, test_idx in zip(est.estimators_, oob_samples_list)
)
all_proba = list(zip(*oob_preds_test_idxs))
else:
inbag_samples = est.estimators_samples_
all_samples = np.arange(X.shape[0])
oob_samples_list = [
np.setdiff1d(all_samples, inbag_samples[i]) for i in range(len(inbag_samples))
]
Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
delayed(_parallel_predict_proba_oob)(e.predict_proba, X, all_proba, idx, test_idx, lock)
for idx, (e, test_idx) in enumerate(zip(est.estimators_, oob_samples_list))
# avoid storing the output of every estimator by summing them here
lock = threading.Lock()
# accumulate the predictions across all trees
all_proba = np.full(
(len(est.estimators_), X.shape[0], est.n_classes_), np.nan, dtype=np.float64
)
if hasattr(est, "oob_samples_"):
Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
delayed(_parallel_predict_proba_oob)(
e.predict_proba, X, all_proba, idx, test_idx, lock
)
for idx, (e, test_idx) in enumerate(zip(est.estimators_, est.oob_samples_))
)
else:
inbag_samples = est.estimators_samples_
all_samples = np.arange(X.shape[0])
oob_samples_list = [
np.setdiff1d(all_samples, inbag_samples[i]) for i in range(len(inbag_samples))
]
Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
delayed(_parallel_predict_proba_oob)(
e.predict_proba, X, all_proba, idx, test_idx, lock
)
for idx, (e, test_idx) in enumerate(zip(est.estimators_, oob_samples_list))
)

return est, all_proba
27 changes: 17 additions & 10 deletions treeple/stats/tests/test_forest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import importlib
import itertools
import os

import numpy as np
Expand Down Expand Up @@ -235,20 +236,24 @@ def test_comight_repeated_feature_sets(seed):
assert result.pvalue > 0.05, f"{result.pvalue}"


@pytest.mark.parametrize("use_bottleneck", [True, False])
def test_build_coleman_forest(use_bottleneck: bool):
@pytest.mark.parametrize(
("use_bottleneck", "use_sparse"),
itertools.product([True, False], [True, False]),
)
def test_build_coleman_forest(use_bottleneck: bool, use_sparse: bool):
"""Simple test for building a Coleman forest.
Test the function under alternative and null hypothesis for a very simple dataset.
"""
if use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ:
del os.environ[utils.DISABLE_BN_ENV_VAR]
importlib.reload(utils)
importlib.reload(stats)
else:
if not use_bottleneck and utils.DISABLE_BN_ENV_VAR not in os.environ:
os.environ[utils.DISABLE_BN_ENV_VAR] = "1"
importlib.reload(utils)
importlib.reload(stats)
elif use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ:
del os.environ[utils.DISABLE_BN_ENV_VAR]

# We need to reload the modules after changing the environment variable
# because an environment variable is used to disable bottleneck
importlib.reload(utils)
importlib.reload(stats)

n_estimators = 100
n_samples = 30
Expand Down Expand Up @@ -290,7 +295,9 @@ def test_build_coleman_forest(use_bottleneck: bool):
perm_forest_proba,
clf_fitted,
perm_clf_fitted,
) = stats.build_coleman_forest(clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed)
) = stats.build_coleman_forest(
clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed, use_sparse=use_sparse
)
assert clf_fitted._n_samples_bootstrap == round(n_samples * 1.6)
assert perm_clf_fitted._n_samples_bootstrap == round(n_samples * 1.6)
assert_array_equal(perm_clf_fitted.permutation_indices_.shape, (n_samples, 1))
Expand Down
67 changes: 67 additions & 0 deletions treeple/stats/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import numpy as np
import pytest
import scipy.sparse as sp
from numpy.testing import assert_array_equal

import treeple.stats.utils as utils
Expand Down Expand Up @@ -67,3 +68,69 @@ def test_non_nan_samples(use_bottleneck: bool):
expected = np.array([0, 2])
actual = utils._non_nan_samples(posterior_array)
np.testing.assert_array_equal(expected, actual)


@pytest.mark.parametrize("use_bottleneck", [True, False])
def test_nanmean_f(use_bottleneck: bool):
if use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ:
del os.environ[utils.DISABLE_BN_ENV_VAR]
importlib.reload(utils)
else:
os.environ[utils.DISABLE_BN_ENV_VAR] = "1"
importlib.reload(utils)

posterior_array = np.array(
[
[1, 2, np.nan],
[3, 4, np.nan],
]
)

expected = np.array([1.5, 3.5])
actual = utils.nanmean_f(posterior_array, axis=1)
np.testing.assert_array_equal(expected, actual)


@pytest.mark.parametrize(
("forest_indices", "expected"),
[
(np.arange(3), np.array([0.375, 0.75, 0.25])),
(np.arange(3) + 2, np.array([0.10, 0.05, 0.25])),
(np.arange(3) + 3, np.array([0.10, 0.45, np.nan])),
],
)
def test_get_forest_preds_sparse(
forest_indices,
expected,
):

all_y_pred = sp.csc_matrix(
np.array(
[
[0.50, 0.00, 0.00],
[0.25, 0.75, 0.00],
[0.00, 0.00, 0.25],
[0.10, 0.00, 0.00],
[0.00, 0.05, 0.00],
[0.00, 0.85, 0.00],
]
)
)

all_y_indicator = sp.csc_matrix(
np.array(
[
[1, 0, 0],
[1, 1, 0],
[0, 0, 1],
[1, 0, 0],
[0, 1, 0],
[0, 1, 0],
]
)
)

np.testing.assert_array_equal(
utils._get_forest_preds_sparse(all_y_pred, all_y_indicator, forest_indices),
expected,
)
Loading

0 comments on commit e1c38ad

Please sign in to comment.