Adding honest forest checks

Signed-off-by: Adam Li <[email protected]>
adam2392 committed Nov 2, 2023
1 parent bc2780e commit 6d914f6
Showing 15 changed files with 9,894 additions and 117 deletions.
743 changes: 643 additions & 100 deletions benchmarks_nonasv/bench_multiview_hyppo.ipynb


2,201 changes: 2,201 additions & 0 deletions benchmarks_nonasv/cv_partial_auc_mv_vs_rf_correlated_latentfactor_model.csv


2,401 changes: 2,401 additions & 0 deletions benchmarks_nonasv/cv_partial_auc_mv_vs_rf_ind_views_gaussian_mixture.csv


2,401 changes: 2,401 additions & 0 deletions benchmarks_nonasv/cv_partial_auc_mv_vs_rf_ind_views_gaussian_mixture_v2.csv


2,201 changes: 2,201 additions & 0 deletions benchmarks_nonasv/cv_partial_auc_mv_vs_rf_linear_transform.csv


13 changes: 13 additions & 0 deletions doc/api.rst
@@ -150,6 +150,19 @@ tree models.
PermutationForestClassifier
PermutationForestRegressor

+Datasets
+------------------------------
+We provide some convenience functions for simulating datasets beyond
+those offered in scikit-learn.
+
+.. currentmodule:: sktree.datasets
+.. autosummary::
+   :toctree: generated/
+
+   make_gaussian_mixture
+   make_joint_factor_model
+   make_quadratic_classification
+

Experimental Functionality
--------------------------
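The new `sktree.datasets` entries documented above can be exercised directly. A minimal sketch, using the `make_quadratic_classification` signature from this commit (the small problem sizes here are illustrative only):

    import numpy as np

    from sktree.datasets import make_quadratic_classification

    # Simulate a two-class quadratic problem; noise=True adds the noise term.
    X, y = make_quadratic_classification(n_samples=128, n_features=16, noise=True, seed=0)
    y = np.asarray(y).squeeze()  # the test below also squeezes the returned targets
    print(X.shape, y.shape)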
7 changes: 7 additions & 0 deletions doc/references.bib
@@ -146,4 +146,11 @@ @misc{perry2009crossvalidation
eprint = {0909.3052},
archiveprefix = {arXiv},
primaryclass = {stat.ME}
}

+@article{panda2018learning,
+title = {Learning Interpretable Characteristic Kernels via Decision Forests},
+author = {Panda, Sambit and Shen, Cencheng and Vogelstein, Joshua T},
+journal = {arXiv preprint arXiv:1812.00029},
+year = {2018}
+}
2 changes: 2 additions & 0 deletions sktree/datasets/__init__.py
@@ -0,0 +1,2 @@
+from .hyppo import make_quadratic_classification
+from .multiview import make_gaussian_mixture, make_joint_factor_model
9 changes: 8 additions & 1 deletion sktree/datasets/hyppo.py
@@ -1,9 +1,11 @@
import numpy as np


-def quadratic(n_samples: int, n_features: int, noise=False, seed=None):
+def make_quadratic_classification(n_samples: int, n_features: int, noise=False, seed=None):
"""Simulate classification data from a quadratic model.
+This is a form of the simulation used in :footcite:`panda2018learning`.
Parameters
----------
n_samples : int
@@ -21,6 +23,10 @@ def quadratic(n_samples: int, n_features: int, noise=False, seed=None):
Data array.
v : array-like, shape (n_samples,)
Target array of 1's and 0's.
+References
+----------
+.. footbibliography::
"""
rng = np.random.default_rng(seed)

@@ -31,6 +37,7 @@ def quadratic(n_samples: int, n_features: int, noise=False, seed=None):
x_coeffs = x * coeffs
y = x_coeffs**2 + noise * eps

+# generate the classification labels
n1 = x.shape[0]
n2 = y.shape[0]
v = np.vstack([np.zeros((n1, 1)), np.ones((n2, 1))])
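The hunk above makes the labeling explicit: the simulation stacks the n1 raw samples (class 0) on top of the n2 quadratically transformed samples (class 1). A quick sketch of that label construction:

    import numpy as np

    # Mirrors the vstack in make_quadratic_classification: zeros for the
    # raw samples, ones for the transformed ones.
    n1, n2 = 3, 3
    v = np.vstack([np.zeros((n1, 1)), np.ones((n2, 1))])
    print(v.ravel())  # [0. 0. 0. 1. 1. 1.]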
4 changes: 2 additions & 2 deletions sktree/datasets/multiview.py
@@ -1,5 +1,5 @@
# Original source: https://github.com/mvlearn/mvlearn
-# License: MIT
+# Author: Ronan Perry

import numpy as np
from scipy.stats import ortho_group
@@ -339,7 +339,7 @@ def make_joint_factor_model(
U = np.linalg.qr(U)[0]

# random noise for each view
-Es = [noise_std * rng.standard_normal(size=(n_samples, d)) for d in zip(n_features)]
+Es = [noise_std * rng.standard_normal(size=(n_samples, d)) for d in n_features]
Xs = [(U * svals) @ view_loadings[b].T + Es[b] for b in range(n_views)]

if return_decomp:
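The change above is a real bug fix, not a cleanup: `zip(n_features)` yields 1-tuples, so each `d` would be a tuple and `rng.standard_normal(size=(n_samples, d))` would raise a TypeError; iterating `n_features` directly yields the per-view integer dimensions. A minimal illustration:

    n_features = [10, 20, 30]

    print(list(zip(n_features)))  # [(10,), (20,), (30,)] -- 1-tuples, not ints
    print(list(n_features))       # [10, 20, 30] -- what the noise term needs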
2 changes: 1 addition & 1 deletion sktree/experimental/mutual_info.py
@@ -147,7 +147,7 @@ def mutual_info_ksg(
algorithm="kd_tree",
n_jobs: int = -1,
transform: str = "rank",
-random_seed: int = None,
+random_seed: Optional[int] = None,
):
"""Compute the generalized (conditional) mutual information KSG estimate.
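This annotation fix (repeated in the files below) follows PEP 484: a parameter defaulting to `None` must be typed `Optional[...]`; type checkers reject the implicit form (mypy, for instance, does so by default in recent releases). A sketch of the distinction:

    from typing import Optional

    def bad(seed: int = None):  # flagged by type checkers: None is not an int
        ...

    def good(seed: Optional[int] = None):  # explicit and PEP 484-compliant
        ...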
12 changes: 6 additions & 6 deletions sktree/stats/forestht.py
@@ -1,4 +1,4 @@
-from typing import Callable, Tuple, Union
+from typing import Optional, Callable, Tuple, Union

import numpy as np
from joblib import Parallel, delayed
@@ -43,7 +43,7 @@ def _parallel_build_trees_and_compute_posteriors(
predict_posteriors: bool,
permute_per_tree: bool,
type_of_target,
-sample_weight: ArrayLike = None,
+sample_weight: Optional[ArrayLike] = None,
class_weight=None,
missing_values_in_feature_mask=None,
classes=None,
@@ -255,7 +255,7 @@ def _statistic(
):
raise NotImplementedError("Subclasses should implement this!")

-def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: ArrayLike = None):
+def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: Optional[ArrayLike] = None):
X, y = check_X_y(X, y, ensure_2d=True, copy=True, multi_output=True)
if y.ndim != 2:
y = y.reshape(-1, 1)
@@ -295,7 +295,7 @@ def statistic(
self,
X: ArrayLike,
y: ArrayLike,
-covariate_index: ArrayLike = None,
+covariate_index: Optional[ArrayLike] = None,
metric="mi",
return_posteriors: bool = False,
check_input: bool = True,
@@ -414,7 +414,7 @@ def test(
self,
X,
y,
-covariate_index: ArrayLike = None,
+covariate_index: Optional[ArrayLike] = None,
metric: str = "mi",
n_repeats: int = 1000,
return_posteriors: bool = True,
@@ -660,7 +660,7 @@ def statistic(
self,
X: ArrayLike,
y: ArrayLike,
-covariate_index: ArrayLike = None,
+covariate_index: Optional[ArrayLike] = None,
metric="mse",
return_posteriors: bool = False,
check_input: bool = True,
5 changes: 3 additions & 2 deletions sktree/stats/permutationforest.py
@@ -10,6 +10,7 @@
from sktree._lib.sklearn.ensemble._forest import BaseForest, ForestClassifier, ForestRegressor

from .utils import METRIC_FUNCTIONS, REGRESSOR_METRICS, _compute_null_distribution_perm
+from typing import Optional


class BasePermutationForest(MetaEstimatorMixin):
@@ -62,7 +63,7 @@ def _statistic(
estimator: BaseForest,
X: ArrayLike,
y: ArrayLike,
-covariate_index: ArrayLike = None,
+covariate_index: Optional[ArrayLike] = None,
metric="mse",
return_posteriors: bool = False,
seed=None,
@@ -117,7 +118,7 @@ def statistic(
self,
X: ArrayLike,
y: ArrayLike,
-covariate_index: ArrayLike = None,
+covariate_index: Optional[ArrayLike] = None,
metric="mse",
return_posteriors: bool = False,
check_input: bool = True,
6 changes: 3 additions & 3 deletions sktree/stats/utils.py
@@ -1,4 +1,4 @@
-from typing import Tuple
+from typing import Optional, Tuple

import numpy as np
from numpy.typing import ArrayLike
@@ -112,7 +112,7 @@ def _compute_null_distribution_perm(
est: ForestClassifier,
metric: str = "mse",
n_repeats: int = 1000,
-seed: int = None,
+seed: Optional[int] = None,
) -> ArrayLike:
"""Compute null distribution using permutation method.
@@ -173,7 +173,7 @@ def _compute_null_distribution_coleman(
y_pred_proba_perm: ArrayLike,
metric: str = "mse",
n_repeats: int = 1000,
-seed: int = None,
+seed: Optional[int] = None,
) -> Tuple[ArrayLike, ArrayLike]:
"""Compute null distribution using Coleman method.
4 changes: 2 additions & 2 deletions sktree/tests/test_honest_forest.py
@@ -9,7 +9,7 @@
from sklearn.utils.estimator_checks import parametrize_with_checks

from sktree._lib.sklearn.tree import DecisionTreeClassifier
-from sktree.datasets.hyppo import quadratic
+from sktree.datasets.hyppo import make_quadratic_classification
from sktree.ensemble import HonestForestClassifier
from sktree.tree import ObliqueDecisionTreeClassifier, PatchObliqueDecisionTreeClassifier

@@ -262,7 +262,7 @@ def test_honest_forest_with_sklearn_trees():
https://github.com/neurodata/scikit-tree/pull/157."""

# generate the high-dimensional quadratic data
-X, y = quadratic(1024, 4096, noise=True, seed=0)
+X, y = make_quadratic_classification(1024, 4096, noise=True, seed=0)
y = y.squeeze()
print(X.shape, y.shape)
print(np.sum(y) / len(y))
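For context, the regression test above boils down to fitting an honest forest on the renamed simulation. A minimal sketch at a smaller scale (the constructor arguments here are illustrative assumptions, not taken from this diff):

    from sktree.datasets.hyppo import make_quadratic_classification
    from sktree.ensemble import HonestForestClassifier

    # A much smaller problem than the test's 1024 x 4096 setup.
    X, y = make_quadratic_classification(128, 16, noise=True, seed=0)
    clf = HonestForestClassifier(n_estimators=10, random_state=0)
    clf.fit(X, y.squeeze())
    print(clf.predict_proba(X).shape)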
