Add datasets

Signed-off-by: Adam Li <[email protected]>
neurodata · Nov 9, 2023 · 5ac7e26 · 5ac7e26
1 parent e4728fa
commit 5ac7e26
Show file tree

Hide file tree

Showing 7 changed files with 419 additions and 1 deletion.
diff --git a/doc/whats_new/v0.4.rst b/doc/whats_new/v0.4.rst
@@ -14,6 +14,7 @@ Changelog
 ---------
 
 - |API| ``FeatureImportanceForest*`` now has a hyperparameter to control the number of permutations is done per forest ``permute_per_forest_fraction``, by `Adam Li`_ (:pr:`145`)
+- |Enhancement| Add dataset generators for regression, classification, and hypothesis testing, by `Adam Li`_ (:pr:`160`)
 
 Code and Documentation Contributors
 -----------------------------------

diff --git a/sktree/_lib/sklearn_fork b/sktree/_lib/sklearn_fork
diff --git a/sktree/datasets/__init__.py b/sktree/datasets/__init__.py
@@ -0,0 +1,2 @@
+from .hyppo import make_quadratic_classification
+from .multiview import make_gaussian_mixture, make_joint_factor_model
diff --git a/sktree/datasets/hyppo.py b/sktree/datasets/hyppo.py
@@ -0,0 +1,45 @@
+import numpy as np
+
+
+def make_quadratic_classification(n_samples: int, n_features: int, noise=False, seed=None):
+    """Simulate two-class data where class 1 is a quadratic transform of class 0.
+
+    This is a form of the simulation used in :footcite:`panda2018learning`.
+
+    Parameters
+    ----------
+    n_samples : int
+        The number of samples to generate per class.
+    n_features : int
+        The number of dimensions in the dataset.
+    noise : bool, optional
+        Whether to add standard normal noise to class 1, by default False.
+    seed : int, optional
+        Random seed, by default None.
+
+    Returns
+    -------
+    x : ndarray, shape (2 * n_samples, n_features)
+        Class 0 rows (standard normal draws) stacked above class 1 rows.
+    v : ndarray, shape (2 * n_samples, 1)
+        Column of labels: 0.0 for the first half, 1.0 for the second half.
+
+    References
+    ----------
+    .. footbibliography::
+    """
+    rng = np.random.default_rng(seed)
+
+    # ``eps`` is drawn unconditionally; it only contributes when ``noise`` is truthy.
+    x = rng.standard_normal(size=(n_samples, n_features))
+    eps = rng.standard_normal(size=(n_samples, n_features))
+
+    # Per-feature weights decay exponentially with the feature index.
+    coeffs = np.exp(-0.0325 * (np.arange(n_features) + 24))
+    scaled = x * coeffs
+    y = scaled**2 + noise * eps
+
+    # Label the base samples 0 and the transformed samples 1, in stacking order.
+    v = np.repeat([0.0, 1.0], n_samples).reshape(-1, 1)
+    x = np.vstack((x, y))
+    return x, v
diff --git a/sktree/datasets/meson.build b/sktree/datasets/meson.build
@@ -0,0 +1,11 @@
+python_sources = [  # Python modules that make up the sktree/datasets subpackage
+  '__init__.py',
+  'multiview.py',
+  'hyppo.py',
+]
+
+py3.install_sources(
+  python_sources,
+  pure: false,  # install under the platform-specific dir (platlib), not purelib
+  subdir: 'sktree/datasets'  # destination path relative to the chosen install dir
+)
-Original file line number
+Diff line change
@@ Expand Up / @@ -14,6 +14,7 @@ Changelog @@
     ---------
     - |API| ``FeatureImportanceForest*`` now has a hyperparameter to control the number of permutations is done per forest ``permute_per_forest_fraction``, by `Adam Li`_ (:pr:`145`)
+    - |Enhancement| Add dataset generators for regression, classification, and hypothesis testing, by `Adam Li`_ (:pr:`160`)
     Code and Documentation Contributors
     -----------------------------------
@@ Expand Down @@
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from .hyppo import make_quadratic_classification
		from .multiview import make_gaussian_mixture, make_joint_factor_model