Sparse Implementation of build_coleman_forest plus a fix for bottle…

…neck import (#317) * sparse implementation of build_coleman_forest. fixed call bottleneck check * added a unit test for utils._get_forest_preds_sparse"
neurodata · Oct 11, 2024 · e1c38ad · e1c38ad
1 parent fe6661d
commit e1c38ad
Show file tree

Hide file tree

Showing 5 changed files with 438 additions and 55 deletions.
diff --git a/doc/whats_new/v0.10.rst b/doc/whats_new/v0.10.rst
@@ -15,6 +15,8 @@ Changelog
 
 - |Feature| Calculations involving nans in ``treeple.stats.utils`` now use the
     ``bottleneck`` library for faster computation. By `Ryan Hausen`_ (:pr:`#306`)
+- |Feature| Added a sparse implementation of `treeple.stats.forest.build_colemen_forest`
+    that uses the `scipy.sparse` module. By `Ryan Hausen`_ (:pr:`#317`)
 
 
 Code and Documentation Contributors

diff --git a/treeple/stats/forest.py b/treeple/stats/forest.py
@@ -15,7 +15,12 @@
 from ..tree import MultiViewDecisionTreeClassifier
 from ..tree._classes import DTYPE
 from .permuteforest import PermutationHonestForestClassifier
-from .utils import METRIC_FUNCTIONS, POSITIVE_METRICS, _compute_null_distribution_coleman
+from .utils import (
+    METRIC_FUNCTIONS,
+    POSITIVE_METRICS,
+    _compute_null_distribution_coleman,
+    _compute_null_distribution_coleman_sparse,
+)
 
 
 def _parallel_predict_proba_oob(predict_proba, X, out, idx, test_idx, lock):
@@ -34,6 +39,20 @@ def _parallel_predict_proba_oob(predict_proba, X, out, idx, test_idx, lock):
     return prediction
 
 
+def _parallel_predict_proba_oob_sparse(predict_proba, X, test_idx):
+    """
+    This is a utility function for joblib's Parallel.
+    It can't go locally in ForestClassifier or ForestRegressor, because joblib
+    complains that it cannot pickle it when placed there. Different from
+    _parallel_predict_proba_oob, this function returns the predictions and
+    indices for those values that are finite (not nan or inf).
+    """
+    # each tree predicts proba with a list of output (n_samples, n_classes[i])
+    prediction = predict_proba(X[test_idx, :], check_input=False)
+    good_value_mask = np.isfinite(prediction[:, 0])
+    return test_idx[good_value_mask], prediction[good_value_mask]
+
+
 ForestTestResult = namedtuple(
     "ForestTestResult",
     ["observe_test_stat", "permuted_stat", "observe_stat", "pvalue", "null_dist"],
@@ -51,6 +70,7 @@ def build_coleman_forest(
     verbose=False,
     seed=None,
     return_posteriors=True,
+    use_sparse=False,
     **metric_kwargs,
 ):
     """Build a hypothesis testing forest using a two-forest approach.
@@ -89,6 +109,9 @@ def build_coleman_forest(
         Random seed, by default None.
     return_posteriors : bool, optional
         Whether or not to return the posteriors, by default True.
+    use_sparse : bool, optional
+        Whether or not to use a sparse for the calculation of the permutation
+        statistics, by default False. Doesn't affect return values.
     **metric_kwargs : dict, optional
         Additional keyword arguments to pass to the metric function.
 
@@ -118,7 +141,7 @@ def build_coleman_forest(
         raise RuntimeError(f"Original forest must be a HonestForestClassifier, got {type(est)}")
 
     # build two sets of forests
-    est, orig_forest_proba = build_oob_forest(est, X, y, verbose=verbose)
+    est, orig_forest_proba = build_oob_forest(est, X, y, use_sparse=use_sparse, verbose=verbose)
 
     if not isinstance(perm_est, PermutationHonestForestClassifier):
         raise RuntimeError(
@@ -133,29 +156,70 @@ def build_coleman_forest(
         )
 
     perm_est, perm_forest_proba = build_oob_forest(
-        perm_est, X, y, verbose=verbose, covariate_index=covariate_index
+        perm_est,
+        X,
+        y,
+        use_sparse=use_sparse,
+        verbose=verbose,
+        covariate_index=covariate_index,
     )
 
     # get the number of jobs
     n_jobs = est.n_jobs
 
     if y.ndim == 1:
         y = y.reshape(-1, 1)
-    metric_star, metric_star_pi = _compute_null_distribution_coleman(
-        y,
-        orig_forest_proba,
-        perm_forest_proba,
-        metric,
-        n_repeats=n_repeats,
-        seed=seed,
-        n_jobs=n_jobs,
-        **metric_kwargs,
-    )
 
-    y_pred_proba_orig = np.nanmean(orig_forest_proba, axis=0)
-    y_pred_proba_perm = np.nanmean(perm_forest_proba, axis=0)
-    observe_stat = metric_func(y, y_pred_proba_orig, **metric_kwargs)
-    permute_stat = metric_func(y, y_pred_proba_perm, **metric_kwargs)
+    if use_sparse:
+        y_pred_proba_orig_perm, observe_stat, permute_stat, metric_star, metric_star_pi = (
+            _compute_null_distribution_coleman_sparse(
+                y,
+                orig_forest_proba,
+                perm_forest_proba,
+                metric,
+                n_repeats=n_repeats,
+                seed=seed,
+                n_jobs=n_jobs,
+                **metric_kwargs,
+            )
+        )
+
+        # if we are returning the posteriors, then we need to replace the
+        # sparse indices and values with an array. We convert the sparse data
+        # to dense data, so that the function returns results in a consistent format.
+        if return_posteriors:
+            n_trees = y_pred_proba_orig_perm.shape[0] // 2
+            n_samples = y_pred_proba_orig_perm.shape[1]
+
+            to_coords_data = lambda x: (x.row.astype(int), x.col.astype(int), x.data)
+
+            row, col, data = to_coords_data(y_pred_proba_orig_perm[:n_trees, :].tocoo())
+            orig_forest_proba = np.full((n_trees, n_samples), np.nan, dtype=np.float64)
+            orig_forest_proba[row, col] = data
+
+            row, col, data = to_coords_data(y_pred_proba_orig_perm[n_trees:, :].tocoo())
+            perm_forest_proba = np.full((n_trees, n_samples), np.nan, dtype=np.float64)
+            perm_forest_proba[row, col] = data
+
+            if y.shape[1] == 2:
+                orig_forest_proba = np.column_stack((orig_forest_proba, 1 - orig_forest_proba))
+                perm_forest_proba = np.column_stack((perm_forest_proba, 1 - perm_forest_proba))
+    else:
+        metric_star, metric_star_pi = _compute_null_distribution_coleman(
+            y,
+            orig_forest_proba,
+            perm_forest_proba,
+            metric,
+            n_repeats=n_repeats,
+            seed=seed,
+            n_jobs=n_jobs,
+            **metric_kwargs,
+        )
+
+        y_pred_proba_orig = np.nanmean(orig_forest_proba, axis=0)
+        y_pred_proba_perm = np.nanmean(perm_forest_proba, axis=0)
+        observe_stat = metric_func(y, y_pred_proba_orig, **metric_kwargs)
+        permute_stat = metric_func(y, y_pred_proba_perm, **metric_kwargs)
 
     # metric^\pi - metric = observed test statistic, which under the
     # null is normally distributed around 0
@@ -179,7 +243,9 @@ def build_coleman_forest(
         return forest_result
 
 
-def build_oob_forest(est: ForestClassifier, X, y, verbose=False, **est_kwargs):
+def build_oob_forest(
+    est: ForestClassifier, X, y, use_sparse: bool = False, verbose=False, **est_kwargs
+):
     """Build a hypothesis testing forest using oob samples.
 
     Parameters
@@ -193,6 +259,8 @@ def build_oob_forest(est: ForestClassifier, X, y, verbose=False, **est_kwargs):
         Data.
     y : ArrayLike of shape (n_samples, n_outputs)
         Binary target, so ``n_outputs`` should be at most 1.
+    use_sparse : bool, optional
+        Whether or not to use a sparse representation for the posteriors.
     verbose : bool, optional
         Verbosity, by default False.
     **est_kwargs : dict, optional
@@ -227,26 +295,46 @@ def build_oob_forest(est: ForestClassifier, X, y, verbose=False, **est_kwargs):
     # Assign chunk of trees to jobs
     n_jobs, _, _ = _partition_estimators(est.n_estimators, est.n_jobs)
 
-    # avoid storing the output of every estimator by summing them here
-    lock = threading.Lock()
-    # accumulate the predictions across all trees
-    all_proba = np.full(
-        (len(est.estimators_), X.shape[0], est.n_classes_), np.nan, dtype=np.float64
-    )
-    if hasattr(est, "oob_samples_"):
-        Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
-            delayed(_parallel_predict_proba_oob)(e.predict_proba, X, all_proba, idx, test_idx, lock)
-            for idx, (e, test_idx) in enumerate(zip(est.estimators_, est.oob_samples_))
+    if use_sparse:
+        if hasattr(est, "oob_samples_"):
+            oob_samples_list = est.oob_samples_
+        else:
+            inbag_samples = est.estimators_samples_
+            all_samples = np.arange(X.shape[0])
+            oob_samples_list = [
+                np.setdiff1d(all_samples, inbag_samples[i]) for i in range(len(inbag_samples))
+            ]
+
+        oob_preds_test_idxs = Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
+            delayed(_parallel_predict_proba_oob_sparse)(e.predict_proba, X, test_idx)
+            for e, test_idx in zip(est.estimators_, oob_samples_list)
         )
+        all_proba = list(zip(*oob_preds_test_idxs))
     else:
-        inbag_samples = est.estimators_samples_
-        all_samples = np.arange(X.shape[0])
-        oob_samples_list = [
-            np.setdiff1d(all_samples, inbag_samples[i]) for i in range(len(inbag_samples))
-        ]
-        Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
-            delayed(_parallel_predict_proba_oob)(e.predict_proba, X, all_proba, idx, test_idx, lock)
-            for idx, (e, test_idx) in enumerate(zip(est.estimators_, oob_samples_list))
+        # avoid storing the output of every estimator by summing them here
+        lock = threading.Lock()
+        # accumulate the predictions across all trees
+        all_proba = np.full(
+            (len(est.estimators_), X.shape[0], est.n_classes_), np.nan, dtype=np.float64
         )
+        if hasattr(est, "oob_samples_"):
+            Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
+                delayed(_parallel_predict_proba_oob)(
+                    e.predict_proba, X, all_proba, idx, test_idx, lock
+                )
+                for idx, (e, test_idx) in enumerate(zip(est.estimators_, est.oob_samples_))
+            )
+        else:
+            inbag_samples = est.estimators_samples_
+            all_samples = np.arange(X.shape[0])
+            oob_samples_list = [
+                np.setdiff1d(all_samples, inbag_samples[i]) for i in range(len(inbag_samples))
+            ]
+            Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
+                delayed(_parallel_predict_proba_oob)(
+                    e.predict_proba, X, all_proba, idx, test_idx, lock
+                )
+                for idx, (e, test_idx) in enumerate(zip(est.estimators_, oob_samples_list))
+            )
 
     return est, all_proba
diff --git a/treeple/stats/tests/test_forest.py b/treeple/stats/tests/test_forest.py
@@ -1,4 +1,5 @@
 import importlib
+import itertools
 import os
 
 import numpy as np
@@ -235,20 +236,24 @@ def test_comight_repeated_feature_sets(seed):
     assert result.pvalue > 0.05, f"{result.pvalue}"
 
 
-@pytest.mark.parametrize("use_bottleneck", [True, False])
-def test_build_coleman_forest(use_bottleneck: bool):
+@pytest.mark.parametrize(
+    ("use_bottleneck", "use_sparse"),
+    itertools.product([True, False], [True, False]),
+)
+def test_build_coleman_forest(use_bottleneck: bool, use_sparse: bool):
     """Simple test for building a Coleman forest.
 
     Test the function under alternative and null hypothesis for a very simple dataset.
     """
-    if use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ:
-        del os.environ[utils.DISABLE_BN_ENV_VAR]
-        importlib.reload(utils)
-        importlib.reload(stats)
-    else:
+    if not use_bottleneck and utils.DISABLE_BN_ENV_VAR not in os.environ:
         os.environ[utils.DISABLE_BN_ENV_VAR] = "1"
-        importlib.reload(utils)
-        importlib.reload(stats)
+    elif use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ:
+        del os.environ[utils.DISABLE_BN_ENV_VAR]
+
+    # We need to reload the modules after changing the environment variable
+    # because an environment variable is used to disable bottleneck
+    importlib.reload(utils)
+    importlib.reload(stats)
 
     n_estimators = 100
     n_samples = 30
@@ -290,7 +295,9 @@ def test_build_coleman_forest(use_bottleneck: bool):
         perm_forest_proba,
         clf_fitted,
         perm_clf_fitted,
-    ) = stats.build_coleman_forest(clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed)
+    ) = stats.build_coleman_forest(
+        clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed, use_sparse=use_sparse
+    )
     assert clf_fitted._n_samples_bootstrap == round(n_samples * 1.6)
     assert perm_clf_fitted._n_samples_bootstrap == round(n_samples * 1.6)
     assert_array_equal(perm_clf_fitted.permutation_indices_.shape, (n_samples, 1))

diff --git a/treeple/stats/tests/test_utils.py b/treeple/stats/tests/test_utils.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 import pytest
+import scipy.sparse as sp
 from numpy.testing import assert_array_equal
 
 import treeple.stats.utils as utils
@@ -67,3 +68,69 @@ def test_non_nan_samples(use_bottleneck: bool):
     expected = np.array([0, 2])
     actual = utils._non_nan_samples(posterior_array)
     np.testing.assert_array_equal(expected, actual)
+
+
+@pytest.mark.parametrize("use_bottleneck", [True, False])
+def test_nanmean_f(use_bottleneck: bool):
+    if use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ:
+        del os.environ[utils.DISABLE_BN_ENV_VAR]
+        importlib.reload(utils)
+    else:
+        os.environ[utils.DISABLE_BN_ENV_VAR] = "1"
+        importlib.reload(utils)
+
+    posterior_array = np.array(
+        [
+            [1, 2, np.nan],
+            [3, 4, np.nan],
+        ]
+    )
+
+    expected = np.array([1.5, 3.5])
+    actual = utils.nanmean_f(posterior_array, axis=1)
+    np.testing.assert_array_equal(expected, actual)
+
+
+@pytest.mark.parametrize(
+    ("forest_indices", "expected"),
+    [
+        (np.arange(3), np.array([0.375, 0.75, 0.25])),
+        (np.arange(3) + 2, np.array([0.10, 0.05, 0.25])),
+        (np.arange(3) + 3, np.array([0.10, 0.45, np.nan])),
+    ],
+)
+def test_get_forest_preds_sparse(
+    forest_indices,
+    expected,
+):
+
+    all_y_pred = sp.csc_matrix(
+        np.array(
+            [
+                [0.50, 0.00, 0.00],
+                [0.25, 0.75, 0.00],
+                [0.00, 0.00, 0.25],
+                [0.10, 0.00, 0.00],
+                [0.00, 0.05, 0.00],
+                [0.00, 0.85, 0.00],
+            ]
+        )
+    )
+
+    all_y_indicator = sp.csc_matrix(
+        np.array(
+            [
+                [1, 0, 0],
+                [1, 1, 0],
+                [0, 0, 1],
+                [1, 0, 0],
+                [0, 1, 0],
+                [0, 1, 0],
+            ]
+        )
+    )
+
+    np.testing.assert_array_equal(
+        utils._get_forest_preds_sparse(all_y_pred, all_y_indicator, forest_indices),
+        expected,
+    )