Skip to content

Commit

Permalink
Add datasets
Browse files Browse the repository at this point in the history
Signed-off-by: Adam Li <[email protected]>
  • Loading branch information
adam2392 committed Nov 9, 2023
1 parent e4728fa commit 5ac7e26
Show file tree
Hide file tree
Showing 7 changed files with 419 additions and 1 deletion.
1 change: 1 addition & 0 deletions doc/whats_new/v0.4.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Changelog
---------

- |API| ``FeatureImportanceForest*`` now has a hyperparameter, ``permute_per_forest_fraction``, to control the number of permutations done per forest, by `Adam Li`_ (:pr:`145`)
- |Enhancement| Add dataset generators for regression and classification and hypothesis testing, by `Adam Li`_ (:pr:`160`)

Code and Documentation Contributors
-----------------------------------
Expand Down
2 changes: 1 addition & 1 deletion sktree/_lib/sklearn_fork
Submodule sklearn_fork updated 123 files
2 changes: 2 additions & 0 deletions sktree/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .hyppo import make_quadratic_classification
from .multiview import make_gaussian_mixture, make_joint_factor_model
45 changes: 45 additions & 0 deletions sktree/datasets/hyppo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import numpy as np


def make_quadratic_classification(n_samples: int, n_features: int, noise=False, seed=None):
    """Simulate classification data from a quadratic model.

    This is a form of the simulation used in :footcite:`panda2018learning`.

    Class 0 consists of ``n_samples`` standard normal draws. Class 1 consists
    of those same draws, scaled feature-wise by exponentially decaying
    coefficients and then squared, optionally perturbed by standard normal
    noise. The two classes are stacked row-wise, so the returned arrays have
    ``2 * n_samples`` rows.

    Parameters
    ----------
    n_samples : int
        The number of samples to generate per class.
    n_features : int
        The number of dimensions in the dataset.
    noise : bool, optional
        Whether or not to add noise, by default False. The value multiplies
        the standard normal noise term, so a number acts as a noise scale.
    seed : int, optional
        Random seed, by default None.

    Returns
    -------
    x : ndarray of shape (2 * n_samples, n_features)
        Data array. The first ``n_samples`` rows are the class 0 samples and
        the remaining ``n_samples`` rows are the class 1 samples.
    v : ndarray of shape (2 * n_samples, 1)
        Target column vector of 0's (first half) and 1's (second half).

    References
    ----------
    .. footbibliography::
    """
    rng = np.random.default_rng(seed)

    # class 0: plain standard normal samples
    samples = rng.standard_normal(size=(n_samples, n_features))

    # per-feature weights exp(-0.0325 * (i + 24)), decaying with feature index i
    coeffs = np.exp(-0.0325 * (np.arange(n_features) + 24))

    # class 1: squared, coefficient-weighted copies of the class 0 samples
    squared = (samples * coeffs) ** 2
    if noise:
        # drawn after ``samples`` so the random stream order is unchanged
        squared = squared + noise * rng.standard_normal(size=(n_samples, n_features))

    # generate the classification labels: zeros for class 0, ones for class 1
    v = np.vstack([np.zeros((n_samples, 1)), np.ones((n_samples, 1))])
    x = np.vstack((samples, squared))
    return x, v
11 changes: 11 additions & 0 deletions sktree/datasets/meson.build
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Python source files of the datasets subpackage that get installed.
python_sources = [
'__init__.py',
'multiview.py',
'hyppo.py',
]

# Install the files listed above under the 'sktree/datasets' subdirectory.
# NOTE(review): `pure: false` presumably installs them into the
# platform-specific location next to compiled extensions -- confirm against
# the Meson python module documentation.
py3.install_sources(
python_sources,
pure: false,
subdir: 'sktree/datasets'
)
Loading

0 comments on commit 5ac7e26

Please sign in to comment.