diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs
index 40eab48..b2500ce 100644
--- a/benches/benchmarks.rs
+++ b/benches/benchmarks.rs
@@ -5,7 +5,7 @@ use rustrees::Dataset;
 use rustrees::DecisionTree;
 
 fn decision_tree_housing(train: &Dataset, test: &Dataset) {
-    let dt = DecisionTree::train_reg(train, 5, Some(1), Some(42));
+    let dt = DecisionTree::train_reg(train, Some(5), Some(1), None, Some(42));
     if train.n_samples() <= 1 {
         let pred = dt.predict(&test);
         println!("R2: {}", r2(&test.target_vector, &pred));
@@ -42,7 +42,7 @@ fn criterion_benchmark(c: &mut Criterion) {
 
         // benchmark prediction
         let pred_name = "predict_decision_tree_".to_string() + dataset;
-        let dt = DecisionTree::train_reg(&train, 5, Some(1), Some(42));
+        let dt = DecisionTree::train_reg(&train, Some(5), Some(1), None, Some(42));
         c.bench_function(&pred_name, |b| {
             b.iter(|| predict_decision_tree_housing(&dt, &test))
         });
diff --git a/python/rustrees/decision_tree.py b/python/rustrees/decision_tree.py
index dd15044..0412ba6 100644
--- a/python/rustrees/decision_tree.py
+++ b/python/rustrees/decision_tree.py
@@ -13,7 +13,13 @@ class DecisionTree(BaseEstimator):
     Options for regression and classification are available.
     """
 
-    def __init__(self, min_samples_leaf=1, max_depth: int = 10, random_state=None):
+    def __init__(
+        self,
+        min_samples_leaf=1,
+        max_depth: int = 10,
+        max_features: int = None,
+        random_state=None,
+    ):
         """
         Parameters
         ----------
@@ -21,11 +27,14 @@ def __init__(self, min_samples_leaf=1, max_depth: int = 10, random_state=None):
             The minimum number of samples required to be at a leaf node. The default is 1.
         max_depth : int, optional
             The maximum depth of the tree. The default is 10.
+        max_features: int, optional
+            The maximum number of features per split. Default is None, which means all features are considered.
         random_state : int, optional
             The seed used by the random number generator. The default is None.
         """
         self.min_samples_leaf = min_samples_leaf
         self.max_depth = max_depth
+        self.max_features = max_features
         self.random_state = random_state
 
     def fit(self, X, y):
@@ -84,6 +93,7 @@ def fit(self, X, y) -> "DecisionTreeRegressor":
             dataset,
             min_samples_leaf=self.min_samples_leaf,
             max_depth=self.max_depth,
+            max_features=self.max_features,
             random_state=self.random_state,
         )
         return self
@@ -103,6 +113,7 @@ def fit(self, X, y) -> "DecisionTreeClassifier":
             dataset,
             min_samples_leaf=self.min_samples_leaf,
             max_depth=self.max_depth,
+            max_features=self.max_features,
             random_state=self.random_state,
         )
         return self
diff --git a/python/rustrees/random_forest.py b/python/rustrees/random_forest.py
index 1405fc0..af8722d 100644
--- a/python/rustrees/random_forest.py
+++ b/python/rustrees/random_forest.py
@@ -15,26 +15,30 @@ class RandomForest(BaseEstimator):
 
     def __init__(
         self,
+        n_estimators: int = 100,
         min_samples_leaf=1,
         max_depth: int = 10,
-        n_estimators: int = 100,
+        max_features: int = None,
         random_state=None,
     ):
         """
         Parameters
         ----------
+        n_estimators : int, optional
+            The number of trees in the forest. The default is 100.
         min_samples_leaf : int, optional
             The minimum number of samples required to be at a leaf node. The default is 1.
-        max_depth : int, optional
+        max_depth : int, optional
             The maximum depth of the tree. The default is 10.
-        n_estimators : int, optional
-            The number of trees in the forest. The default is 100.
-        random_state : int, optional
+        max_features: int, optional
+            The maximum number of features per split. Default is None, which means all features are considered.
+        random_state : int, optional
             The seed used by the random number generator. The default is None.
         """
+        self.n_estimators = n_estimators
         self.min_samples_leaf = min_samples_leaf
         self.max_depth = max_depth
-        self.n_estimators = n_estimators
+        self.max_features = max_features
         self.random_state = random_state
 
     def fit(self, X, y):
@@ -91,9 +95,10 @@ def fit(self, X, y) -> "RandomForestRegressor":
         dataset = prepare_dataset(X, y)
         self.forest = rt_dt.train_reg(
             dataset,
+            n_estimators=self.n_estimators,
             min_samples_leaf=self.min_samples_leaf,
             max_depth=self.max_depth,
-            n_estimators=self.n_estimators,
+            max_features=self.max_features,
             random_state=self.random_state,
         )
         return self
@@ -111,9 +116,10 @@ def fit(self, X, y) -> "RandomForestClassifier":
         dataset = prepare_dataset(X, y)
         self.forest = rt_dt.train_clf(
             dataset,
+            n_estimators=self.n_estimators,
             min_samples_leaf=self.min_samples_leaf,
             max_depth=self.max_depth,
-            n_estimators=self.n_estimators,
+            max_features=self.max_features,
             random_state=self.random_state,
         )
         return self
diff --git a/src/dataset.rs b/src/dataset.rs
index fab58d1..3add5de 100644
--- a/src/dataset.rs
+++ b/src/dataset.rs
@@ -1,13 +1,13 @@
-use pyo3::prelude::*;
-use rand::{rngs::StdRng, Rng};
-use std::fs;
-use arrow::array::{Float32Array};
-use arrow::record_batch::RecordBatch;
+use arrow::array::Float32Array;
 use arrow::compute::cast;
 use arrow::csv;
-use std::fs::File;
 use arrow::datatypes::DataType;
 use arrow::pyarrow::PyArrowConvert;
+use arrow::record_batch::RecordBatch;
+use pyo3::prelude::*;
+use rand::{rngs::StdRng, Rng};
+use std::fs;
+use std::fs::File;
 
 use pyo3::types::PyAny;
 
@@ -22,10 +22,9 @@ pub struct Dataset {
 }
 
 impl Dataset {
-
     fn _from_pyarrow(df: &PyAny) -> Dataset {
         let batch = RecordBatch::from_pyarrow(df).unwrap();
-
+
         let feature_names = batch
             .schema()
             .fields()
@@ -41,22 +40,22 @@ impl Dataset {
             feature_matrix: feature_matrix[0..feature_matrix.len() - 1].to_vec(),
             target_name: feature_names.last().unwrap().to_string(),
             target_vector: feature_matrix.last().unwrap().to_vec(),
-        }
+        }
     }
 
-    fn _read_batch(batch: RecordBatch) -> Vec<Vec<f32>>{
+    fn _read_batch(batch: RecordBatch) -> Vec<Vec<f32>> {
         batch
-        .columns()
-        .iter()
-        .map(|c| cast(c, &DataType::Float32).unwrap())
-        .map(|c| {
-            c.as_any()
-                .downcast_ref::<Float32Array>()
-                .unwrap()
-                .values()
-                .to_vec()
-        })
-        .collect::<Vec<_>>()
+            .columns()
+            .iter()
+            .map(|c| cast(c, &DataType::Float32).unwrap())
+            .map(|c| {
+                c.as_any()
+                    .downcast_ref::<Float32Array>()
+                    .unwrap()
+                    .values()
+                    .to_vec()
+            })
+            .collect::<Vec<_>>()
     }
 
     fn _read_csv(path: &str, sep: &str) -> Dataset {
@@ -100,7 +99,7 @@ impl Dataset {
             feature_uniform: vec![false; self.feature_names.len()],
             feature_matrix: vec![],
             target_name: self.target_name.clone(),
-            target_vector: vec![]
+            target_vector: vec![],
         }
     }
 
@@ -179,7 +178,6 @@ mod tests {
             target_vector: vec![1.0, 0.0],
         };
 
-
         assert_eq!(expected, got);
     }
 
diff --git a/src/tests.rs b/src/tests.rs
index 1c949d7..9a7bc93 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -12,7 +12,7 @@ mod tests {
     fn test_integration() {
         let train = Dataset::read_csv("datasets/diabetes_train.csv", ",");
         let test = Dataset::read_csv("datasets/diabetes_test.csv", ",");
-        let dt = DecisionTree::train_reg(&train, 5, Some(1), Some(42));
+        let dt = DecisionTree::train_reg(&train, Some(5), Some(1), None, Some(42));
        let mut pred = test.clone();
         dt.predict(&mut pred);
         assert_eq!(r2(&test.target_vector, &pred.target_vector) > 0.28, true);
@@ -21,7 +21,7 @@ mod tests {
     #[test]
     fn decision_tree_titanic() {
         let (train, test) = read_train_test_dataset("titanic");
-        let dt = DecisionTree::train_clf(&train, 5, Some(1), Some(43));
+        let dt = DecisionTree::train_clf(&train, Some(5), Some(1), None, Some(43));
         let pred = dt.predict(&test);
         println!("Accuracy: {}", accuracy(&test.target_vector, &pred));
         assert_greater_than(accuracy(&test.target_vector, &pred), 0.237);
@@ -30,7 +30,7 @@ mod tests {
     #[test]
     fn decision_tree_breast_cancer() {
         let (train, test) = read_train_test_dataset("breast_cancer");
-        let dt = DecisionTree::train_clf(&train, 5, Some(1), Some(42));
+        let dt = DecisionTree::train_clf(&train, Some(5), Some(1), None, Some(42));
         let pred = dt.predict(&test);
         println!("Accuracy: {}", accuracy(&test.target_vector, &pred));
         assert_greater_than(accuracy(&test.target_vector, &pred), 0.83);
@@ -39,7 +39,7 @@ mod tests {
     #[test]
     fn decision_tree_housing() {
         let (train, test) = read_train_test_dataset("housing");
-        let dt = DecisionTree::train_reg(&train, 5, Some(1), Some(42));
+        let dt = DecisionTree::train_reg(&train, Some(5), Some(1), None, Some(42));
         let pred = dt.predict(&test);
         println!("R2: {}", r2(&test.target_vector, &pred));
         assert_greater_than(r2(&test.target_vector, &pred), 0.59);
@@ -48,7 +48,7 @@ mod tests {
     #[test]
     fn decision_tree_diabeties() {
         let (train, test) = read_train_test_dataset("diabetes");
-        let dt = DecisionTree::train_reg(&train, 5, Some(1), Some(42));
+        let dt = DecisionTree::train_reg(&train, Some(5), Some(1), None, Some(42));
         let pred = dt.predict(&test);
         println!("R2: {}", r2(&test.target_vector, &pred));
         assert_greater_than(r2(&test.target_vector, &pred), 0.30);
@@ -64,11 +64,10 @@ mod tests {
         (train, test)
     }
 
-
     #[test]
     fn random_forest_diabetes() {
         let (train, test) = read_train_test_dataset("diabetes");
-        let rf = RandomForest::train_reg(&train, 10, Some(5), Some(1), Some(42));
+        let rf = RandomForest::train_reg(&train, 10, Some(5), Some(1), None, Some(42));
         let pred = rf.predict(&test);
         println!("R2: {}", r2(&test.target_vector, &pred));
         assert_greater_than(r2(&test.target_vector, &pred), 0.38);
@@ -77,7 +76,7 @@ mod tests {
     #[test]
     fn random_forest_housing() {
         let (train, test) = read_train_test_dataset("housing");
-        let rf = RandomForest::train_reg(&train, 10, Some(5), Some(1), Some(42));
+        let rf = RandomForest::train_reg(&train, 10, Some(5), Some(1), None, Some(42));
         let pred = rf.predict(&test);
         println!("R2: {}", r2(&test.target_vector, &pred));
         assert_greater_than(r2(&test.target_vector, &pred), 0.641);
@@ -86,19 +85,18 @@ mod tests {
     #[test]
     fn random_forest_breast_cancer() {
         let (train, test) = read_train_test_dataset("breast_cancer");
-        let rf = RandomForest::train_clf(&train, 10, Some(5), Some(1), Some(42));
+        let rf = RandomForest::train_clf(&train, 10, Some(5), Some(1), None, Some(42));
         let pred = rf.predict(&test);
         let pred = classification_threshold(&pred, 0.5);
 
         println!("Accuracy: {}", accuracy(&test.target_vector, &pred),);
         assert_greater_than(accuracy(&test.target_vector, &pred), 0.96);
-
     }
 
     #[test]
     fn random_forest_breast_titanic() {
         let (train, test) = read_train_test_dataset("titanic");
-        let rf = RandomForest::train_clf(&train, 10, Some(5), Some(1), Some(42));
+        let rf = RandomForest::train_clf(&train, 10, Some(5), Some(1), None, Some(42));
         let pred = rf.predict(&test);
         let pred = classification_threshold(&pred, 0.5);
 
diff --git a/src/trees.rs b/src/trees.rs
index bec7536..a7a4480 100644
--- a/src/trees.rs
+++ b/src/trees.rs
@@ -28,13 +28,15 @@ pub struct RandomForest {
 pub struct TrainOptions {
     min_samples_leaf: i32,
     max_depth: i32,
+    max_features: i32,
 }
 
 impl TrainOptions {
-    pub fn default_options() -> TrainOptions {
+    pub fn default_options(num_features: i32) -> TrainOptions {
         TrainOptions {
             max_depth: 10,
             min_samples_leaf: 1,
+            max_features: num_features,
         }
     }
 }
@@ -47,12 +49,14 @@ impl RandomForest {
         n_estimators: i32,
         max_depth: Option<i32>,
         min_samples_leaf: Option<i32>,
+        max_features: Option<i32>,
         random_state: Option<u64>,
     ) -> RandomForest {
+        let default_train_options = TrainOptions::default_options(train.feature_names.len() as i32);
         let params = TrainOptions {
-            max_depth: max_depth.unwrap_or(TrainOptions::default_options().max_depth),
-            min_samples_leaf: min_samples_leaf
-                .unwrap_or(TrainOptions::default_options().min_samples_leaf),
+            max_depth: max_depth.unwrap_or(default_train_options.max_depth),
+            min_samples_leaf: min_samples_leaf.unwrap_or(default_train_options.min_samples_leaf),
+            max_features: max_features.unwrap_or(default_train_options.max_features),
         };
 
         let trees: Vec<Tree> = (0..n_estimators)
@@ -79,14 +83,15 @@ impl RandomForest {
         n_estimators: i32,
         max_depth: Option<i32>,
         min_samples_leaf: Option<i32>,
+        max_features: Option<i32>,
         random_state: Option<u64>,
     ) -> RandomForest {
+        let default_train_options = TrainOptions::default_options(train.feature_names.len() as i32);
         let params = TrainOptions {
-            max_depth: max_depth.unwrap_or(TrainOptions::default_options().max_depth),
-            min_samples_leaf: min_samples_leaf
-                .unwrap_or(TrainOptions::default_options().min_samples_leaf),
+            max_depth: max_depth.unwrap_or(default_train_options.max_depth),
+            min_samples_leaf: min_samples_leaf.unwrap_or(default_train_options.min_samples_leaf),
+            max_features: max_features.unwrap_or(default_train_options.max_features),
         };
-
         let trees: Vec<Tree> = (0..n_estimators)
             .into_par_iter()
             .map(|i| {
@@ -130,23 +135,24 @@ impl RandomForest {
     }
 }
 
-
-
 #[pymethods]
 impl DecisionTree {
     #[staticmethod]
     pub fn train_reg(
         train: &Dataset,
-        max_depth: i32,
+        max_depth: Option<i32>,
         min_samples_leaf: Option<i32>,
+        max_features: Option<i32>,
         random_state: Option<u64>,
     ) -> DecisionTree {
         let mut rng = utils::get_rng(random_state, 0);
+        let default_train_options = TrainOptions::default_options(train.feature_names.len() as i32);
         let params = TrainOptions {
-            max_depth,
-            min_samples_leaf: min_samples_leaf
-                .unwrap_or(TrainOptions::default_options().min_samples_leaf),
+            max_depth: max_depth.unwrap_or(default_train_options.max_depth),
+            min_samples_leaf: min_samples_leaf.unwrap_or(default_train_options.min_samples_leaf),
+            max_features: max_features.unwrap_or(default_train_options.max_features),
         };
+
         DecisionTree {
             tree: Tree::fit(
                 &train,
@@ -161,15 +167,17 @@ impl DecisionTree {
     #[staticmethod]
     pub fn train_clf(
         train: &Dataset,
-        max_depth: i32,
+        max_depth: Option<i32>,
         min_samples_leaf: Option<i32>,
+        max_features: Option<i32>,
         random_state: Option<u64>,
     ) -> DecisionTree {
         let mut rng = utils::get_rng(random_state, 0);
+        let default_train_options = TrainOptions::default_options(train.feature_names.len() as i32);
         let params = TrainOptions {
-            max_depth,
-            min_samples_leaf: min_samples_leaf
-                .unwrap_or(TrainOptions::default_options().min_samples_leaf),
+            max_depth: max_depth.unwrap_or(default_train_options.max_depth),
+            min_samples_leaf: min_samples_leaf.unwrap_or(default_train_options.min_samples_leaf),
+            max_features: max_features.unwrap_or(default_train_options.max_features),
         };
         DecisionTree {
             tree: Tree::fit(&train, 0, params, gini_coefficient_split_feature, &mut rng),
@@ -429,8 +437,10 @@ impl Tree {
         let mut best_feature = SplitResult::new_max_loss();
         let mut feature_indexes = (0..train.feature_names.len()).collect::<Vec<usize>>();
         feature_indexes.shuffle(rng);
+        let max_features = train_options.max_features;
+        let selected_feature_indexes = feature_indexes[0..max_features as usize].to_vec();
 
-        for i in feature_indexes {
+        for i in selected_feature_indexes {
             if train.feature_uniform[i] {
                 continue;
             }
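For reference, below is a minimal sketch of how the updated Rust signatures are called after this change. The max_features values are illustrative only (passing None keeps the previous behaviour of considering every feature at each split), and RandomForest is assumed to be exported from the crate root alongside Dataset and DecisionTree:

use rustrees::{Dataset, DecisionTree, RandomForest};

fn main() {
    let train = Dataset::read_csv("datasets/diabetes_train.csv", ",");
    let test = Dataset::read_csv("datasets/diabetes_test.csv", ",");

    // Decision tree: (train, max_depth, min_samples_leaf, max_features, random_state).
    // Some(3) restricts each split to 3 randomly chosen features; None uses all of them.
    let dt = DecisionTree::train_reg(&train, Some(5), Some(1), Some(3), Some(42));
    let _dt_pred = dt.predict(&test);

    // Random forest: (train, n_estimators, max_depth, min_samples_leaf, max_features, random_state).
    let rf = RandomForest::train_reg(&train, 10, Some(5), Some(1), None, Some(42));
    let _rf_pred = rf.predict(&test);
}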