From 31218b0f33e3a5f6905341d734a2adafa088e53c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 28 Feb 2024 10:50:25 -0500 Subject: [PATCH 01/27] draft Signed-off-by: Adam Li --- sktree/tree/_oblique_splitter.pxd | 8 +- sktree/tree/_oblique_splitter.pyx | 228 ++++-------------------- sktree/tree/_oblique_tree.pyx | 10 +- sktree/tree/manifold/_morf_splitter.pyx | 11 +- sktree/tree/tests/meson.build | 1 + sktree/tree/tests/test_oblique_trees.py | 34 ++++ test_tree.py | 32 ++++ 7 files changed, 111 insertions(+), 213 deletions(-) create mode 100644 sktree/tree/tests/test_oblique_trees.py create mode 100644 test_tree.py diff --git a/sktree/tree/_oblique_splitter.pxd b/sktree/tree/_oblique_splitter.pxd index 15feaea22..359a072b1 100644 --- a/sktree/tree/_oblique_splitter.pxd +++ b/sktree/tree/_oblique_splitter.pxd @@ -31,8 +31,8 @@ cdef struct ObliqueSplitRecord: float64_t impurity_left # Impurity of the left split. float64_t impurity_right # Impurity of the right split. - vector[float32_t]* proj_vec_weights # weights of the vector (max_features,) - vector[intp_t]* proj_vec_indices # indices of the features (max_features,) + vector[float32_t] proj_vec_weights # weights of the vector (max_features,) + vector[intp_t] proj_vec_indices # indices of the features (max_features,) cdef class BaseObliqueSplitter(Splitter): @@ -72,8 +72,8 @@ cdef class BaseObliqueSplitter(Splitter): intp_t end, const intp_t[:] samples, float32_t[:] feature_values, - vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) - vector[intp_t]* proj_vec_indices # indices of the features (max_features,) + vector[float32_t]& proj_vec_weights, # weights of the vector (max_features,) + vector[intp_t]& proj_vec_indices # indices of the features (max_features,) ) noexcept nogil cdef int node_split( diff --git a/sktree/tree/_oblique_splitter.pyx b/sktree/tree/_oblique_splitter.pyx index aee43b1ce..373f7074d 100644 --- a/sktree/tree/_oblique_splitter.pyx +++ b/sktree/tree/_oblique_splitter.pyx @@ -100,8 +100,8 @@ cdef class BaseObliqueSplitter(Splitter): intp_t end, const intp_t[:] samples, float32_t[:] feature_values, - vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) - vector[intp_t]* proj_vec_indices # indices of the features (max_features,) + vector[float32_t]& proj_vec_weights, # weights of the vector (max_features,) + vector[intp_t]& proj_vec_indices # indices of the features (max_features,) ) noexcept nogil: """Compute the feature values for the samples[start:end] range. @@ -115,8 +115,8 @@ cdef class BaseObliqueSplitter(Splitter): # Compute linear combination of features and then # sort samples according to the feature values. for jdx in range(0, proj_vec_indices.size()): - col_idx = deref(proj_vec_indices)[jdx] - col_weight = deref(proj_vec_weights)[jdx] + col_idx = proj_vec_indices[jdx] + col_weight = proj_vec_weights[jdx] for idx in range(start, end): # initialize the feature value to 0 @@ -327,7 +327,8 @@ cdef class BestObliqueSplitter(ObliqueSplitter): # instantiate the split records _init_split(&best_split, end) - # Sample the projection matrix + # Sample the projection matrix by passing in a reference to the underlying + # vector of vectors for weights and indices self.sample_proj_mat(self.proj_mat_weights, self.proj_mat_indices) # For every vector in the projection matrix @@ -339,8 +340,6 @@ cdef class BestObliqueSplitter(ObliqueSplitter): # XXX: 'feature' is not actually used in oblique split records # Just indicates which split was sampled current_split.feature = feat_i - current_split.proj_vec_weights = &self.proj_mat_weights[feat_i] - current_split.proj_vec_indices = &self.proj_mat_indices[feat_i] # Compute linear combination of features and then # sort samples according to the feature values. @@ -349,8 +348,8 @@ cdef class BestObliqueSplitter(ObliqueSplitter): end, samples, feature_values, - &self.proj_mat_weights[feat_i], - &self.proj_mat_indices[feat_i] + self.proj_mat_weights[feat_i], + self.proj_mat_indices[feat_i] ) # Sort the samples @@ -395,6 +394,16 @@ cdef class BestObliqueSplitter(ObliqueSplitter): best_split = current_split # copy + # Note: we do not make a copy above if we are not going to use it + # as the candidate best split + # create a copy of the projection vectors + with gil: + print("here....") + best_split.proj_vec_weights = self.proj_mat_weights[feat_i] + best_split.proj_vec_indices = self.proj_mat_indices[feat_i] + with gil: + print("finished copying...") + # Reorganize into samples[start:best_split.pos] + samples[best_split.pos:end] if best_split.pos < end: partition_end = end @@ -404,8 +413,8 @@ cdef class BestObliqueSplitter(ObliqueSplitter): # Account for projection vector temp_d = 0.0 for j in range(best_split.proj_vec_indices.size()): - temp_d += self.X[samples[p], deref(best_split.proj_vec_indices)[j]] *\ - deref(best_split.proj_vec_weights)[j] + temp_d += self.X[samples[p], best_split.proj_vec_indices[j]] *\ + best_split.proj_vec_weights[j] if temp_d <= best_split.threshold: p += 1 @@ -423,6 +432,8 @@ cdef class BestObliqueSplitter(ObliqueSplitter): impurity, best_split.impurity_left, best_split.impurity_right) # Return values + with gil: + print("about to return...") deref(oblique_split).proj_vec_indices = best_split.proj_vec_indices deref(oblique_split).proj_vec_weights = best_split.proj_vec_weights deref(oblique_split).feature = best_split.feature @@ -563,8 +574,6 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): # XXX: 'feature' is not actually used in oblique split records # Just indicates which split was sampled current_split.feature = feat_i - current_split.proj_vec_weights = &self.proj_mat_weights[feat_i] - current_split.proj_vec_indices = &self.proj_mat_indices[feat_i] # Compute linear combination of features self.compute_features_over_samples( @@ -572,8 +581,8 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): end, samples, feature_values, - &self.proj_mat_weights[feat_i], - &self.proj_mat_indices[feat_i] + self.proj_mat_weights[feat_i], + self.proj_mat_indices[feat_i] ) # find min, max of the feature_values @@ -618,6 +627,11 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): best_proxy_improvement = current_proxy_improvement best_split = current_split # copy + # Note: we do not make a copy above if we are not going to use it + # as the candidate best split + best_split.proj_vec_weights = self.proj_mat_weights[feat_i] + best_split.proj_vec_indices = self.proj_mat_indices[feat_i] + n_visited_features += 1 # Reorganize into samples[start:best_split.pos] + samples[best_split.pos:end] @@ -629,8 +643,8 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): # Account for projection vector temp_d = 0.0 for j in range(best_split.proj_vec_indices.size()): - temp_d += self.X[samples[p], deref(best_split.proj_vec_indices)[j]] *\ - deref(best_split.proj_vec_weights)[j] + temp_d += self.X[samples[p], best_split.proj_vec_indices[j]] *\ + best_split.proj_vec_weights[j] if temp_d <= best_split.threshold: p += 1 @@ -830,186 +844,6 @@ cdef class MultiViewSplitter(BestObliqueSplitter): if proj_i >= self.max_features: break -# XXX: not used right now -cdef class MultiViewObliqueSplitter(BestObliqueSplitter): - def __cinit__( - self, - Criterion criterion, - intp_t max_features, - intp_t min_samples_leaf, - float64_t min_weight_leaf, - object random_state, - const cnp.int8_t[:] monotonic_cst, - float64_t feature_combinations, - const intp_t[:] feature_set_ends, - intp_t n_feature_sets, - bint uniform_sampling, - *argv - ): - self.feature_set_ends = feature_set_ends - self.uniform_sampling = uniform_sampling - - # infer the number of feature sets - self.n_feature_sets = n_feature_sets - - def __reduce__(self): - """Enable pickling the splitter.""" - return (type(self), - ( - self.criterion, - self.max_features, - self.min_samples_leaf, - self.min_weight_leaf, - self.random_state, - self.monotonic_cst.base if self.monotonic_cst is not None else None, - self.feature_combinations, - self.feature_set_ends, - self.n_feature_sets, - self.uniform_sampling, - ), self.__getstate__()) - - cdef int init( - self, - object X, - const float64_t[:, ::1] y, - const float64_t[:] sample_weight, - const unsigned char[::1] missing_values_in_feature_mask, - ) except -1: - Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) - - self.X = X - - # create a helper array for allowing efficient Fisher-Yates - self.multi_indices_to_sample = vector[vector[intp_t]](self.n_feature_sets) - - cdef intp_t i_feature = 0 - cdef intp_t feature_set_begin = 0 - cdef intp_t size_of_feature_set - cdef intp_t ifeat = 0 - cdef intp_t iproj = 0 - while iproj < self.max_features: - for i_feature in range(self.n_feature_sets): - size_of_feature_set = self.feature_set_ends[i_feature] - feature_set_begin - - for ifeat in range(size_of_feature_set): - self.multi_indices_to_sample[i_feature].push_back(ifeat + feature_set_begin + (iproj * self.n_features)) - iproj += 1 - if iproj >= self.max_features: - break - if iproj >= self.max_features: - break - - feature_set_begin = self.feature_set_ends[i_feature] - return 0 - - cdef void sample_proj_mat( - self, - vector[vector[float32_t]]& proj_mat_weights, - vector[vector[intp_t]]& proj_mat_indices - ) noexcept nogil: - """Sample projection matrix accounting for multi-views. - - This proceeds as a normal sampling projection matrix, - but now also uniformly samples features from each feature set. - """ - cdef intp_t n_features = self.n_features - cdef intp_t n_non_zeros = self.n_non_zeros - cdef UINT32_t* random_state = &self.rand_r_state - - cdef intp_t i, j, feat_i, proj_i, rand_vec_index - cdef float32_t weight - - # construct an array to sample from mTry x n_features set of indices - cdef vector[intp_t] indices_to_sample - cdef intp_t grid_size - - # compute the number of features in each feature set - cdef intp_t n_features_in_set - - # keep track of the beginning and ending indices of each feature set - cdef intp_t feature_set_begin, feature_set_end, idx - feature_set_begin = 0 - - # keep track of number of features sampled relative to n_non_zeros - cdef intp_t ifeature = 0 - - if self.uniform_sampling: - # 01: This algorithm samples features from each feature set uniformly and combines them - # into one sparse projection vector. - while ifeature < n_non_zeros: - for idx in range(self.n_feature_sets): - feature_set_end = self.feature_set_ends[idx] - n_features_in_set = feature_set_end - feature_set_begin - indices_to_sample = self.multi_indices_to_sample[idx] - grid_size = indices_to_sample.size() - - # shuffle indices over the 2D grid for this feature set to sample using Fisher-Yates - for i in range(0, grid_size): - j = rand_int(0, grid_size, random_state) - indices_to_sample[j], indices_to_sample[i] = \ - indices_to_sample[i], indices_to_sample[j] - - # sample a n_non_zeros matrix for each feature set, which proceeds by: - # - sample 'n_non_zeros' in a mtry X n_features projection matrix - # - which consists of +/- 1's chosen at a 1/2s rate - # for i in range(0, n_non_zeros_per_set): - # get the next index from the shuffled index array - rand_vec_index = indices_to_sample[0] - - # get the projection index (i.e. row of the projection matrix) and - # feature index (i.e. column of the projection matrix) - proj_i = rand_vec_index // n_features - feat_i = rand_vec_index % n_features - - # sample a random weight - weight = 1 if (rand_int(0, 2, random_state) == 1) else -1 - - proj_mat_indices[proj_i].push_back(feat_i) # Store index of nonzero - proj_mat_weights[proj_i].push_back(weight) # Store weight of nonzero - - # the new beginning is the previous end - feature_set_begin = feature_set_end - - ifeature += 1 - else: - # 02: Algorithm samples feature combinations from each feature set uniformly and evaluates - # them independently. - feature_set_begin = 0 - - # sample from a feature set - for idx in range(self.n_feature_sets): - feature_set_end = self.feature_set_ends[idx] - n_features_in_set = feature_set_end - feature_set_begin - - # indices to sample is a 1D-index array of size (max_features * n_features_in_set) - # which is Fisher-Yates shuffled to sample random features in each feature set - indices_to_sample = self.multi_indices_to_sample[idx] - grid_size = indices_to_sample.size() - - # shuffle indices over the 2D grid for this feature set to sample using Fisher-Yates - for i in range(0, grid_size): - j = rand_int(0, grid_size, random_state) - indices_to_sample[j], indices_to_sample[i] = \ - indices_to_sample[i], indices_to_sample[j] - - for i in range(0, n_non_zeros): - # get the next index from the shuffled index array - rand_vec_index = indices_to_sample[i] - - # get the projection index (i.e. row of the projection matrix) and - # feature index (i.e. column of the projection matrix) - proj_i = rand_vec_index // n_features - feat_i = rand_vec_index % n_features - - # sample a random weight - weight = 1 if (rand_int(0, 2, random_state) == 1) else -1 - - proj_mat_indices[proj_i].push_back(feat_i) # Store index of nonzero - proj_mat_weights[proj_i].push_back(weight) # Store weight of nonzero - - # the new beginning is the previous end - feature_set_begin = feature_set_end - cdef class MultiViewSplitterTester(MultiViewSplitter): """A class to expose a Python interface for testing.""" diff --git a/sktree/tree/_oblique_tree.pyx b/sktree/tree/_oblique_tree.pyx index 2b0d16d94..2ac1d5f8e 100644 --- a/sktree/tree/_oblique_tree.pyx +++ b/sktree/tree/_oblique_tree.pyx @@ -245,12 +245,10 @@ cdef class ObliqueTree(Tree): # oblique trees store the projection indices and weights # inside the tree itself - self.proj_vec_weights[node_id] = deref( - deref(oblique_split_node).proj_vec_weights - ) - self.proj_vec_indices[node_id] = deref( - deref(oblique_split_node).proj_vec_indices - ) + # Note: this makes a copy of the projection indices and weights by + # dereferencing the pointer to the split record to get the actual values + self.proj_vec_weights[node_id] = deref(oblique_split_node).proj_vec_weights + self.proj_vec_indices[node_id] = deref(oblique_split_node).proj_vec_indices return 1 cdef float32_t _compute_feature( diff --git a/sktree/tree/manifold/_morf_splitter.pyx b/sktree/tree/manifold/_morf_splitter.pyx index 2081ab852..7d471bfe4 100644 --- a/sktree/tree/manifold/_morf_splitter.pyx +++ b/sktree/tree/manifold/_morf_splitter.pyx @@ -11,7 +11,6 @@ cimport numpy as cnp cnp.import_array() -from cython.operator cimport dereference as deref from libcpp.vector cimport vector from ..._lib.sklearn.tree._criterion cimport Criterion @@ -411,8 +410,8 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): intp_t end, const intp_t[:] samples, float32_t[:] feature_values, - vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) - vector[intp_t]* proj_vec_indices # indices of the features (max_features,) + vector[float32_t]& proj_vec_weights, # weights of the vector (max_features,) + vector[intp_t]& proj_vec_indices # indices of the features (max_features,) ) noexcept nogil: """Compute the feature values for the samples[start:end] range. @@ -433,13 +432,13 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): feature_values[idx] = 0 for jdx in range(0, proj_vec_indices.size()): feature_values[idx] += self.X[ - samples[idx], deref(proj_vec_indices)[jdx] - ] * deref(proj_vec_weights)[jdx] + samples[idx], proj_vec_indices[jdx] + ] * proj_vec_weights[jdx] if self.feature_weight is not None: # gets the feature weight for this specific column from X # the default of feature_weights[i] is (1/n_features) for all i - patch_weight += self.feature_weight[samples[idx], deref(proj_vec_indices)[jdx]] + patch_weight += self.feature_weight[samples[idx], proj_vec_indices[jdx]] if self.feature_weight is not None: feature_values[idx] /= patch_weight diff --git a/sktree/tree/tests/meson.build b/sktree/tree/tests/meson.build index c88eeda7c..a35b9fd84 100644 --- a/sktree/tree/tests/meson.build +++ b/sktree/tree/tests/meson.build @@ -7,6 +7,7 @@ python_sources = [ 'test_all_trees.py', 'test_unsupervised_tree.py', 'test_multiview.py', + 'test_oblique_trees.py', ] py3.install_sources( diff --git a/sktree/tree/tests/test_oblique_trees.py b/sktree/tree/tests/test_oblique_trees.py new file mode 100644 index 000000000..45f87eaa8 --- /dev/null +++ b/sktree/tree/tests/test_oblique_trees.py @@ -0,0 +1,34 @@ +import numpy as np + +from sktree import HonestForestClassifier +from sktree.tree import MultiViewDecisionTreeClassifier + + +def test_oblique_tree(): + """Test regression reported in https://github.com/neurodata/scikit-tree/issues/215.""" + n, a = ( + 10, + 20, + ) + x = np.random.normal(size=(n, a)) + y = np.random.binomial(1, 0.5, size=(n)) + + for seed in range(100): + # est = MultiViewDecisionTreeClassifier( + # max_features=0.3, + # feature_set_ends=[15, 20], + # random_state=seed, + # ) + + est = HonestForestClassifier( + n_estimators=10, + max_features=0.3, + feature_set_ends=[15, 20], + # bootstrap=True, + # max_samples=1.6, + tree_estimator=MultiViewDecisionTreeClassifier(), + random_state=seed, + n_jobs=-1, + ) + + est.fit(x, y) diff --git a/test_tree.py b/test_tree.py new file mode 100644 index 000000000..2de2942b9 --- /dev/null +++ b/test_tree.py @@ -0,0 +1,32 @@ +import numpy as np + +from sktree import HonestForestClassifier +from sktree.tree import MultiViewDecisionTreeClassifier, ObliqueDecisionTreeClassifier + +"""Test regression reported in https://github.com/neurodata/scikit-tree/issues/215.""" +n, a = ( + 10, + 20, +) +x = np.random.normal(size=(n, a)) +y = np.random.binomial(1, 0.5, size=(n)) + +for seed in range(100): + # est = MultiViewDecisionTreeClassifier( + # max_features=0.3, + # feature_set_ends=[15, 20], + # random_state=seed, + # ) + + est = HonestForestClassifier( + n_estimators=10, + max_features=0.3, + feature_set_ends=[15, 20], + # bootstrap=True, + # max_samples=1.6, + tree_estimator=MultiViewDecisionTreeClassifier(), + random_state=seed, + n_jobs=-1, + ) + + est.fit(x, y) From 43cb7a73d937c6bdb706be4b19be112d96e08edb Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 28 Feb 2024 12:14:54 -0500 Subject: [PATCH 02/27] Simplify the design? Signed-off-by: Adam Li --- sktree/tree/_oblique_splitter.pxd | 6 ++- sktree/tree/_oblique_splitter.pyx | 70 ++++++++++++++++++------------- sktree/tree/_oblique_tree.pxd | 2 +- sktree/tree/_oblique_tree.pyx | 5 ++- sktree/tree/tests/test_tree.py | 24 +++++++---- test_tree.py | 29 +++++++++---- 6 files changed, 88 insertions(+), 48 deletions(-) diff --git a/sktree/tree/_oblique_splitter.pxd b/sktree/tree/_oblique_splitter.pxd index 359a072b1..16678a492 100644 --- a/sktree/tree/_oblique_splitter.pxd +++ b/sktree/tree/_oblique_splitter.pxd @@ -31,6 +31,10 @@ cdef struct ObliqueSplitRecord: float64_t impurity_left # Impurity of the left split. float64_t impurity_right # Impurity of the right split. + # XXX: By storing the values, they will have to be copied into the split record + # which may be inefficient. + # Inside the tree, they will also again be copied into a Node, which will + # in total incur two copies, whereas we ideally only need one copy. vector[float32_t] proj_vec_weights # weights of the vector (max_features,) vector[intp_t] proj_vec_indices # indices of the features (max_features,) @@ -43,7 +47,7 @@ cdef class BaseObliqueSplitter(Splitter): cdef vector[vector[float32_t]] proj_mat_weights # nonzero weights of sparse proj_mat matrix cdef vector[vector[intp_t]] proj_mat_indices # nonzero indices of sparse proj_mat matrix - # TODO: assumes all oblique splitters only work with dense data + # TODO: assumes all oblique splitters only work with dense data; make work with sparse arrays too cdef const float32_t[:, :] X # feature weights across (n_dims,) diff --git a/sktree/tree/_oblique_splitter.pyx b/sktree/tree/_oblique_splitter.pyx index 373f7074d..264deea36 100644 --- a/sktree/tree/_oblique_splitter.pyx +++ b/sktree/tree/_oblique_splitter.pyx @@ -322,7 +322,7 @@ cdef class BestObliqueSplitter(ObliqueSplitter): cdef intp_t feat_i, p # index over computed features and start/end cdef intp_t partition_end - cdef float32_t temp_d # to compute a projection feature value + cdef float32_t temp_d # to compute a projection feature value # instantiate the split records _init_split(&best_split, end) @@ -337,8 +337,8 @@ cdef class BestObliqueSplitter(ObliqueSplitter): if self.proj_mat_weights[feat_i].empty(): continue - # XXX: 'feature' is not actually used in oblique split records - # Just indicates which split was sampled + # Note: 'feature' is not actually used in oblique split records + # Just indicates which index was sampled in the sampled projection matrix current_split.feature = feat_i # Compute linear combination of features and then @@ -395,14 +395,12 @@ cdef class BestObliqueSplitter(ObliqueSplitter): best_split = current_split # copy # Note: we do not make a copy above if we are not going to use it - # as the candidate best split - # create a copy of the projection vectors - with gil: - print("here....") - best_split.proj_vec_weights = self.proj_mat_weights[feat_i] - best_split.proj_vec_indices = self.proj_mat_indices[feat_i] - with gil: - print("finished copying...") + # as the candidate best split, so here we create a pointer to the + # copy of the projection vectors + # best_split.proj_vec_weights = self.proj_mat_weights[feat_i] + # best_split.proj_vec_indices = self.proj_mat_indices[feat_i] + # best_proj_vec_weights = &self.proj_mat_weights[feat_i] + # best_proj_vec_indices = &self.proj_mat_indices[feat_i] # Reorganize into samples[start:best_split.pos] + samples[best_split.pos:end] if best_split.pos < end: @@ -412,9 +410,12 @@ cdef class BestObliqueSplitter(ObliqueSplitter): while p < partition_end: # Account for projection vector temp_d = 0.0 - for j in range(best_split.proj_vec_indices.size()): - temp_d += self.X[samples[p], best_split.proj_vec_indices[j]] *\ - best_split.proj_vec_weights[j] + # for j in range(best_split.proj_vec_indices.size()): + # # temp_d += self.X[samples[p], best_split.proj_vec_indices[j]] *\ + # # best_split.proj_vec_weights[j] + for j in range(self.proj_mat_weights[best_split.feature].size()): + temp_d += self.X[samples[p], self.proj_mat_indices[best_split.feature][j]] *\ + self.proj_mat_weights[best_split.feature][j] if temp_d <= best_split.threshold: p += 1 @@ -431,11 +432,14 @@ cdef class BestObliqueSplitter(ObliqueSplitter): best_split.improvement = self.criterion.impurity_improvement( impurity, best_split.impurity_left, best_split.impurity_right) - # Return values - with gil: - print("about to return...") - deref(oblique_split).proj_vec_indices = best_split.proj_vec_indices - deref(oblique_split).proj_vec_weights = best_split.proj_vec_weights + # Ensure that the projection vectors are copied into the underlying split record that is + # seen by the tree builder + # deref(oblique_split).proj_vec_indices = deref(best_proj_vec_indices) # best_split.proj_vec_indices + # deref(oblique_split).proj_vec_weights = deref(best_proj_vec_weights) # best_split.proj_vec_weights + deref(oblique_split).proj_vec_indices = self.proj_mat_indices[best_split.feature] + deref(oblique_split).proj_vec_weights = self.proj_mat_weights[best_split.feature] + + # Dereference the pointer to the split record and set the values here deref(oblique_split).feature = best_split.feature deref(oblique_split).pos = best_split.pos deref(oblique_split).threshold = best_split.threshold @@ -571,8 +575,8 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): if self.proj_mat_weights[feat_i].empty(): continue - # XXX: 'feature' is not actually used in oblique split records - # Just indicates which split was sampled + # Note: 'feature' is not actually used in oblique split records + # Just indicates which index was sampled in the sampled projection matrix current_split.feature = feat_i # Compute linear combination of features @@ -629,8 +633,12 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): # Note: we do not make a copy above if we are not going to use it # as the candidate best split - best_split.proj_vec_weights = self.proj_mat_weights[feat_i] - best_split.proj_vec_indices = self.proj_mat_indices[feat_i] + # The self.proj_mat_weights and self.proj_mat_indices already contain + # the best projection vector found at `best_split.feature` (i.e. feat_i). + # best_split.proj_vec_weights = self.proj_mat_weights[feat_i] + # best_split.proj_vec_indices = self.proj_mat_indices[feat_i] + # best_proj_vec_indices = &self.proj_mat_indices[feat_i] + # best_proj_vec_weights = &self.proj_mat_weights[feat_i] n_visited_features += 1 @@ -642,9 +650,12 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): while p < partition_end: # Account for projection vector temp_d = 0.0 - for j in range(best_split.proj_vec_indices.size()): - temp_d += self.X[samples[p], best_split.proj_vec_indices[j]] *\ - best_split.proj_vec_weights[j] + # for j in range(best_split.proj_vec_indices.size()): + # temp_d += self.X[samples[p], best_split.proj_vec_indices[j]] *\ + # best_split.proj_vec_weights[j] + for j in range(self.proj_mat_indices[best_split.feature].size()): + temp_d += self.X[samples[p], self.proj_mat_indices[best_split.feature][j]] *\ + self.proj_mat_weights[best_split.feature][j] if temp_d <= best_split.threshold: p += 1 @@ -661,9 +672,12 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): best_split.improvement = self.criterion.impurity_improvement( impurity, best_split.impurity_left, best_split.impurity_right) + # deref(oblique_split).proj_vec_indices = deref(best_proj_vec_indices) # best_split.proj_vec_indices + # deref(oblique_split).proj_vec_weights = deref(best_proj_vec_weights) # best_split.proj_vec_weights + deref(oblique_split).proj_vec_indices = self.proj_mat_indices[best_split.feature] + deref(oblique_split).proj_vec_weights = self.proj_mat_weights[best_split.feature] + # Return values - deref(oblique_split).proj_vec_indices = best_split.proj_vec_indices - deref(oblique_split).proj_vec_weights = best_split.proj_vec_weights deref(oblique_split).feature = best_split.feature deref(oblique_split).pos = best_split.pos deref(oblique_split).threshold = best_split.threshold diff --git a/sktree/tree/_oblique_tree.pxd b/sktree/tree/_oblique_tree.pxd index 50c7949bf..d4597dbdb 100644 --- a/sktree/tree/_oblique_tree.pxd +++ b/sktree/tree/_oblique_tree.pxd @@ -21,7 +21,7 @@ from ._oblique_splitter cimport ObliqueSplitRecord cdef class ObliqueTree(Tree): cdef vector[vector[float32_t]] proj_vec_weights # (capacity, n_features) array of projection vectors - cdef vector[vector[intp_t]] proj_vec_indices # (capacity, n_features) array of projection vectors + cdef vector[vector[intp_t]] proj_vec_indices # (capacity, n_features) array of projection vectors # overridden methods cdef int _resize_c( diff --git a/sktree/tree/_oblique_tree.pyx b/sktree/tree/_oblique_tree.pyx index 2ac1d5f8e..ee60364ad 100644 --- a/sktree/tree/_oblique_tree.pyx +++ b/sktree/tree/_oblique_tree.pyx @@ -243,8 +243,9 @@ cdef class ObliqueTree(Tree): node.feature = deref(oblique_split_node).feature node.threshold = deref(oblique_split_node).threshold - # oblique trees store the projection indices and weights - # inside the tree itself + # TODO: this is not efficient. There are currently two copies being done of the optimal projection vector. + # One here and one within the splitter. We should try to remove one of these copies in a new design. + # oblique trees store the projection indices and weights inside the tree itself # Note: this makes a copy of the projection indices and weights by # dereferencing the pointer to the split record to get the actual values self.proj_vec_weights[node_id] = deref(oblique_split_node).proj_vec_weights diff --git a/sktree/tree/tests/test_tree.py b/sktree/tree/tests/test_tree.py index 94cdfef62..1eb223c40 100644 --- a/sktree/tree/tests/test_tree.py +++ b/sktree/tree/tests/test_tree.py @@ -259,15 +259,18 @@ def test_oblique_tree_sampling(Tree, random_state=0): assert rc_cv_scores.mean() > 0.91 +@pytest.mark.parametrize("splitter", ["best", "random"]) @pytest.mark.parametrize("Tree", OBLIQUE_TREES.values()) -def test_oblique_trees_feature_combinations_less_than_n_features(Tree): +def test_oblique_trees_feature_combinations_less_than_n_features(Tree, splitter): """Test the hyperparameter ``feature_combinations`` behaves properly.""" X, y = iris.data[:5, :], iris.target[:5, ...] _, n_features = X.shape # asset that the feature combinations is less than the number of features - estimator = ObliqueDecisionTreeClassifier(random_state=0, feature_combinations=3) + estimator = ObliqueDecisionTreeClassifier( + splitter=splitter, random_state=0, feature_combinations=3 + ) estimator.fit(X, y) assert estimator.feature_combinations_ < n_features @@ -280,8 +283,9 @@ def test_oblique_trees_feature_combinations_less_than_n_features(Tree): assert estimator.feature_combinations_ < n_features +@pytest.mark.parametrize("splitter", ["best", "random"]) @pytest.mark.parametrize("Tree", OBLIQUE_TREES.values()) -def test_oblique_trees_feature_combinations(Tree): +def test_oblique_trees_feature_combinations(Tree, splitter): """Test the hyperparameter ``feature_combinations`` behaves properly.""" if is_classifier(Tree): @@ -296,27 +300,27 @@ def test_oblique_trees_feature_combinations(Tree): with pytest.raises( RuntimeError, match=f"Feature combinations {n_features + 1} should not be greater" ): - estimator = Tree(random_state=0, feature_combinations=n_features + 1) + estimator = Tree(splitter=splitter, random_state=0, feature_combinations=n_features + 1) estimator.fit(X, y) # asset that the feature combinations is less than the number of features - estimator = Tree(random_state=0, feature_combinations=3) + estimator = Tree(splitter=splitter, random_state=0, feature_combinations=3) estimator.fit(X, y) assert estimator.feature_combinations_ < n_features # default option should make it 1.5 if n_features > 1.5 - estimator = Tree(random_state=0) + estimator = Tree(splitter=splitter, random_state=0) estimator.fit(X, y) assert estimator.feature_combinations_ == 1.5 # setting the feature combinations explicitly is fine as long as it is < n_features - estimator = Tree(random_state=0, feature_combinations=3) + estimator = Tree(splitter=splitter, random_state=0, feature_combinations=3) estimator.fit(X, y) assert estimator.feature_combinations_ == 3 # edge-case of only a single feature should set feature_combinations properly X = X[:, 0:1] - estimator = Tree(random_state=0) + estimator = Tree(splitter=splitter, random_state=0) estimator.fit(X, y) assert estimator.feature_combinations_ == 1 @@ -524,7 +528,9 @@ def test_balance_property(criterion, Tree): X, y = diabetes.data, diabetes.target reg = Tree(criterion=criterion) reg.fit(X, y) - assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y)) + assert np.sum(reg.predict(X)) == pytest.approx( + np.sum(y) + ), f"Failed with {Tree} and {criterion}: {np.sum(reg.predict(X))} != {np.sum(y)}" @pytest.mark.parametrize("Tree", ALL_TREES.values()) diff --git a/test_tree.py b/test_tree.py index 2de2942b9..d79452236 100644 --- a/test_tree.py +++ b/test_tree.py @@ -1,30 +1,45 @@ import numpy as np -from sktree import HonestForestClassifier +from sktree import ( + ExtraObliqueRandomForestClassifier, + HonestForestClassifier, + ObliqueRandomForestClassifier, +) from sktree.tree import MultiViewDecisionTreeClassifier, ObliqueDecisionTreeClassifier """Test regression reported in https://github.com/neurodata/scikit-tree/issues/215.""" n, a = ( - 10, + 10000, 20, ) x = np.random.normal(size=(n, a)) y = np.random.binomial(1, 0.5, size=(n)) -for seed in range(100): +for seed in range(1): # est = MultiViewDecisionTreeClassifier( # max_features=0.3, # feature_set_ends=[15, 20], # random_state=seed, # ) - est = HonestForestClassifier( + # est = HonestForestClassifier( + # n_estimators=10, + # max_features=0.3, + # feature_set_ends=[15, 20], + # # bootstrap=True, + # # max_samples=1.6, + # tree_estimator=MultiViewDecisionTreeClassifier(), + # random_state=seed, + # n_jobs=-1, + # ) + + est = ExtraObliqueRandomForestClassifier( n_estimators=10, max_features=0.3, - feature_set_ends=[15, 20], - # bootstrap=True, + # feature_set_ends=[15, 20], + bootstrap=True, # max_samples=1.6, - tree_estimator=MultiViewDecisionTreeClassifier(), + # splitter='random', random_state=seed, n_jobs=-1, ) From c2e2a54baf03dcd589d8c002acca90defe0d9f72 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 28 Feb 2024 17:22:20 -0500 Subject: [PATCH 03/27] Update submodule Signed-off-by: Adam Li --- sktree/_lib/sklearn_fork | 2 +- sktree/tree/unsupervised/_unsup_oblique_tree.pyx | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/sktree/_lib/sklearn_fork b/sktree/_lib/sklearn_fork index 94517a38a..db5b137c1 160000 --- a/sktree/_lib/sklearn_fork +++ b/sktree/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit 94517a38a6354ee02ef715d1077c8ec6d1713d3b +Subproject commit db5b137c1c1d2cb90aed2354dcb5b933e1df803b diff --git a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx index 5b77b9b04..f621b6985 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx +++ b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx @@ -225,12 +225,8 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): # oblique trees store the projection indices and weights # inside the tree itself - self.proj_vec_weights[node_id] = deref( - deref(oblique_split_node).proj_vec_weights - ) - self.proj_vec_indices[node_id] = deref( - deref(oblique_split_node).proj_vec_indices - ) + self.proj_vec_weights[node_id] = deref(oblique_split_node).proj_vec_weights + self.proj_vec_indices[node_id] = deref(oblique_split_node).proj_vec_indices return 1 cdef float32_t _compute_feature( From ec852a241520ceea8d2cfc15a7d4f67d62a86483 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Feb 2024 10:59:32 -0500 Subject: [PATCH 04/27] Remove unnecessary requirements files Signed-off-by: Adam Li --- .circleci/config.yml | 4 ++-- .github/workflows/build_wheels.yml | 6 +++--- .github/workflows/main.yml | 12 ++++++------ .github/workflows/style.yml | 2 +- CONTRIBUTING.md | 2 +- DEVELOPING.md | 10 +++++----- README.md | 2 +- doc/install.rst | 2 +- pyproject.toml | 12 ++++++------ 9 files changed, 26 insertions(+), 26 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index acba98a00..f24f135fb 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -81,8 +81,8 @@ jobs: - run: name: setup Python venv command: | - pip install -r build_requirements.txt - pip install -r doc_requirements.txt + pip install .[build] + pip install .[doc] - run: name: build scikit-tree command: | diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 45db9bf20..6dd07873c 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -60,7 +60,7 @@ jobs: - name: Setup submodule run: | - python -m pip install -r build_requirements.txt + python -m pip install .[build] python -m pip install spin python spin setup-submodule @@ -115,7 +115,7 @@ jobs: - name: Install cibuildwheel dependencies run: | python -m pip install cibuildwheel - python -m pip install -r build_requirements.txt + python -m pip install .[build] python -m pip install spin python spin setup-submodule @@ -146,7 +146,7 @@ jobs: - name: Build source distribution run: | - python -m pip install -r build_requirements.txt + python -m pip install .[build] python -m pip install spin python spin setup-submodule python -m build diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 80bcb739e..455326153 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -65,9 +65,9 @@ jobs: - name: Install Python packages run: | - python -m pip install -r build_requirements.txt + python -m pip install .[build] python -m pip install spin - python -m pip install -r test_requirements.txt + python -m pip install .[test] - name: Prepare compiler cache id: prep-ccache @@ -172,9 +172,9 @@ jobs: - name: Install Python packages run: | - python -m pip install -r build_requirements.txt + python -m pip install .[build] python -m pip install spin - python -m pip install -r test_requirements.txt + python -m pip install .[test] - name: Prepare compiler cache id: prep-ccache @@ -287,8 +287,8 @@ jobs: # - name: pip-packages # run: | -# pip install -r build_requirements.txt -# pip install -r test_requirements.txt +# pip install .[build] +# pip install .[test] # pip install spin # pip install numpy==1.22.4 diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index 30173a454..dba371019 100644 --- a/.github/workflows/style.yml +++ b/.github/workflows/style.yml @@ -41,7 +41,7 @@ jobs: sudo apt-get install -y libopenblas-dev libatlas-base-dev liblapack-dev - name: Install dependencies - run: pip install -r style_requirements.txt + run: pip install .[style] # check formatting of the code style - name: Check code formatting diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 73c6c70d5..6278bbdb4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -93,7 +93,7 @@ When you're ready to contribute code to address an open issue, please follow the Once your virtual environment is activated, you can install your local clone in "editable mode" with - pip install -r build_requirements.txt + pip install .[build] pip install -e . The "editable mode" comes from the `-e` argument to `pip`, and essential just creates a symbolic link from the site-packages directory of your virtual environment to the source code in your local clone. That way any changes you make will be immediately reflected in your virtual environment. diff --git a/DEVELOPING.md b/DEVELOPING.md index 3f2e645fe..6103f340e 100644 --- a/DEVELOPING.md +++ b/DEVELOPING.md @@ -39,14 +39,14 @@ Assuming these steps have worked properly and you have read and followed any nec If you are developing locally, you will need the build dependencies to compile the Cython / C++ code: - pip install -r build_requirements.txt + pip install .[build] Other requirements can be installed as such: - pip install -r requirements.txt - pip install -r style_requirements.txt - pip install -r test_requirements.txt - pip install -r doc_requirements.txt + pip install . + pip install .[style] + pip install .[test] + pip install .[doc] # Building the project from source diff --git a/README.md b/README.md index e8c767478..68e2ce946 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ Building locally with Meson (For developers) Make sure you have the necessary packages installed # install build dependencies - pip install -r build_requirements.txt + pip install .[build] # you may need these optional dependencies to build scikit-learn locally conda install -c conda-forge joblib threadpoolctl pytest compilers llvm-openmp diff --git a/doc/install.rst b/doc/install.rst index 67edeeb23..099693b85 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -41,7 +41,7 @@ Then run installation of build packages .. code-block:: bash - pip install -r build_requirements.txt + pip install .[build] pip install spin # use spin CLI to run Meson build locally diff --git a/pyproject.toml b/pyproject.toml index ed0e197d9..6d684edff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [build-system] build-backend = "mesonpy" requires = [ - "meson-python>=0.13.0", + "meson-python>=0.15.0", 'ninja', # `wheel` is needed for non-isolated builds, given that `meson-python` # doesn't list it as a runtime requirement (at least in 0.10.0) @@ -9,8 +9,8 @@ requires = [ "wheel", "setuptools<=65.5", "packaging", - "Cython>=0.29.24", - "scikit-learn>=1.4", + "Cython>=3.0.8", + "scikit-learn>=1.4.1", "scipy>=1.5.0", "numpy>=1.25; python_version>='3.9'" ] @@ -51,7 +51,7 @@ include = [ dependencies = [ 'numpy', 'scipy>=1.5.0', - 'scikit-learn>=1.3.1' + 'scikit-learn>=1.4.1' ] [project.optional-dependencies] @@ -68,8 +68,8 @@ build = [ 'meson-python', 'spin', 'doit', - 'scikit-learn>=1.3.1', - 'Cython>=0.29.36', + 'scikit-learn>=1.4.1', + 'Cython>=3.0.8', 'ninja', 'numpy', 'rich-click', From f87534af989ee743a483ee584701bbce98dc85ba Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Feb 2024 11:46:46 -0500 Subject: [PATCH 05/27] Update installation Signed-off-by: Adam Li --- .circleci/config.yml | 2 +- .github/workflows/build_wheels.yml | 24 ++++++++++++------------ .github/workflows/main.yml | 21 +++++++++++---------- .github/workflows/release.yml | 4 ++-- spin | 2 +- 5 files changed, 27 insertions(+), 26 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index f24f135fb..d3fb56085 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -86,7 +86,7 @@ jobs: - run: name: build scikit-tree command: | - python spin build -j2 + spin build -j2 - save_cache: key: deps_ccache-{{ .Branch }} diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 6dd07873c..ee3f39624 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -60,9 +60,9 @@ jobs: - name: Setup submodule run: | - python -m pip install .[build] - python -m pip install spin - python spin setup-submodule + pip install .[build] + pip install spin + spin setup-submodule - name: win_amd64 - install rtools run: | @@ -79,7 +79,7 @@ jobs: # configuration step to create a build directory. The subsequent wheel # build then needs to use that directory. This can be done with pip # using a command like: - # python -m pip wheel --config-settings builddir=build . + # pip wheel --config-settings builddir=build . # if: >- # ( ! contains(matrix.os[2], 'arm64' ) ) env: @@ -114,10 +114,10 @@ jobs: - name: Install cibuildwheel dependencies run: | - python -m pip install cibuildwheel - python -m pip install .[build] - python -m pip install spin - python spin setup-submodule + pip install cibuildwheel + pip install .[build] + pip install spin + spin setup-submodule - name: Build wheels uses: pypa/cibuildwheel@v2.16.5 @@ -146,10 +146,10 @@ jobs: - name: Build source distribution run: | - python -m pip install .[build] - python -m pip install spin - python spin setup-submodule - python -m build + pip install .[build] + pip install spin + spin setup-submodule + build - name: Store artifacts uses: actions/upload-artifact@v4 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 455326153..dfb2416be 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -65,9 +65,10 @@ jobs: - name: Install Python packages run: | - python -m pip install .[build] - python -m pip install spin - python -m pip install .[test] + pip install .[build] + pip install spin + pip install .[test] + spin setup-submodule - name: Prepare compiler cache id: prep-ccache @@ -172,9 +173,9 @@ jobs: - name: Install Python packages run: | - python -m pip install .[build] - python -m pip install spin - python -m pip install .[test] + pip install .[build] + pip install spin + pip install .[test] - name: Prepare compiler cache id: prep-ccache @@ -232,7 +233,7 @@ jobs: - name: Upload coverage stats to codecov uses: codecov/codecov-action@v4 with: - # python spin goes into the INSTALLED path in order to run pytest + # spin goes into the INSTALLED path in order to run pytest files: ./coverage.xml fail_ci_if_error: true verbose: true @@ -319,8 +320,8 @@ jobs: # run: | # echo "SCIPY_USE_PROPACK=1" >> $env:GITHUB_ENV # echo "FORCE_SUBMODULE=True" >> $env:GITHUB_ENV -# python spin setup_submodule --forcesubmodule -# python spin build -j 2 +# spin setup_submodule --forcesubmodule +# spin build -j 2 # # Necessary because GitHub Actions checks out the repo to D:\ while OpenBLAS # # got installed to C:\ higher up. The copying with `--win-cp-openblas` fails # # when things are split over drives. @@ -329,4 +330,4 @@ jobs: # - name: test # run: | -# python spin test +# spin test diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 60ef6fe0f..fd574cde2 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -32,8 +32,8 @@ jobs: architecture: "x64" - name: Install dependencies run: | - python -m pip install --progress-bar off --upgrade pip - python -m pip install --progress-bar off build twine + pip install --progress-bar off --upgrade pip + pip install --progress-bar off build twine - name: Prepare environment run: | echo "RELEASE_VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_ENV diff --git a/spin b/spin index 7e69cd06b..f23a70790 100755 --- a/spin +++ b/spin @@ -1,6 +1,6 @@ #!/usr/bin/env python # -# Example stub for running `python -m spin` +# Example stub for running `spin` # # Copy this into your project root. From f1fc2512d46536a991163eef2a78ca2a50fb17c9 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Feb 2024 11:48:06 -0500 Subject: [PATCH 06/27] Change cache path Signed-off-by: Adam Li --- .github/workflows/main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index dfb2416be..f4024cddc 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -46,7 +46,7 @@ jobs: python-version: ${{ matrix.python-version }} architecture: "x64" cache: "pip" - cache-dependency-path: "requirements.txt" + cache-dependency-path: "pyproject.toml" - name: show-gcc run: | @@ -154,7 +154,7 @@ jobs: python-version: ${{ matrix.python-version }} architecture: "x64" cache: "pip" - cache-dependency-path: "requirements.txt" + cache-dependency-path: "pyproject.toml" - name: show-gcc run: | @@ -259,7 +259,7 @@ jobs: # python-version: "3.10" # architecture: "x64" # cache: "pip" -# cache-dependency-path: "requirements.txt" +# cache-dependency-path: "pyproject.toml" # - name: install-rtools # run: | From fe0ba93444356f0a5f9dc830bc49a24dcbf3e06c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Feb 2024 11:59:25 -0500 Subject: [PATCH 07/27] Fix installation Signed-off-by: Adam Li --- .github/workflows/build_wheels.yml | 6 +++--- .github/workflows/main.yml | 8 ++++---- .github/workflows/style.yml | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index ee3f39624..2eb20d41a 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -60,7 +60,7 @@ jobs: - name: Setup submodule run: | - pip install .[build] + pip install .[build] --only-extras pip install spin spin setup-submodule @@ -115,7 +115,7 @@ jobs: - name: Install cibuildwheel dependencies run: | pip install cibuildwheel - pip install .[build] + pip install .[build] --only-extras pip install spin spin setup-submodule @@ -146,7 +146,7 @@ jobs: - name: Build source distribution run: | - pip install .[build] + pip install .[build] --only-extras pip install spin spin setup-submodule build diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f4024cddc..26a788c44 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -65,9 +65,9 @@ jobs: - name: Install Python packages run: | - pip install .[build] + pip install .[build] --only-extras pip install spin - pip install .[test] + pip install .[test] --only-extras spin setup-submodule - name: Prepare compiler cache @@ -173,9 +173,9 @@ jobs: - name: Install Python packages run: | - pip install .[build] + pip install .[build] --only-extras pip install spin - pip install .[test] + pip install .[test] --only-extras - name: Prepare compiler cache id: prep-ccache diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index dba371019..a275e11d4 100644 --- a/.github/workflows/style.yml +++ b/.github/workflows/style.yml @@ -41,7 +41,7 @@ jobs: sudo apt-get install -y libopenblas-dev libatlas-base-dev liblapack-dev - name: Install dependencies - run: pip install .[style] + run: pip install .[style] --only-extras # check formatting of the code style - name: Check code formatting From 013ae608b04cd1778f4d307700a7e6ec7ef5fdc6 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Feb 2024 12:08:38 -0500 Subject: [PATCH 08/27] Trya gain Signed-off-by: Adam Li --- .github/workflows/build_wheels.yml | 7 ++++--- .github/workflows/main.yml | 9 +++++---- .github/workflows/style.yml | 5 ++++- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 2eb20d41a..5e43e29b0 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -60,9 +60,10 @@ jobs: - name: Setup submodule run: | - pip install .[build] --only-extras + pip install --upgrade pip pip install spin spin setup-submodule + pip install .[build] - name: win_amd64 - install rtools run: | @@ -115,9 +116,9 @@ jobs: - name: Install cibuildwheel dependencies run: | pip install cibuildwheel - pip install .[build] --only-extras pip install spin spin setup-submodule + pip install .[build] - name: Build wheels uses: pypa/cibuildwheel@v2.16.5 @@ -146,9 +147,9 @@ jobs: - name: Build source distribution run: | - pip install .[build] --only-extras pip install spin spin setup-submodule + pip install .[build] build - name: Store artifacts diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 26a788c44..432e78517 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -65,10 +65,10 @@ jobs: - name: Install Python packages run: | - pip install .[build] --only-extras pip install spin - pip install .[test] --only-extras spin setup-submodule + pip install .[build] + pip install .[test] - name: Prepare compiler cache id: prep-ccache @@ -173,9 +173,10 @@ jobs: - name: Install Python packages run: | - pip install .[build] --only-extras pip install spin - pip install .[test] --only-extras + spin setup-submodule + pip install .[build] + pip install .[test] - name: Prepare compiler cache id: prep-ccache diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index a275e11d4..b9369698d 100644 --- a/.github/workflows/style.yml +++ b/.github/workflows/style.yml @@ -41,7 +41,10 @@ jobs: sudo apt-get install -y libopenblas-dev libatlas-base-dev liblapack-dev - name: Install dependencies - run: pip install .[style] --only-extras + run: | + pip install --upgrade pip spin + spin setup-submodule + pip install .[style] # check formatting of the code style - name: Check code formatting From 1034206f13ae5e698ddc973c738617e28eb4f73f Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Feb 2024 12:15:22 -0500 Subject: [PATCH 09/27] Fix install in circleci now Signed-off-by: Adam Li --- .circleci/config.yml | 2 ++ .github/workflows/build_wheels.yml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index d3fb56085..13e071cd3 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -81,6 +81,8 @@ jobs: - run: name: setup Python venv command: | + pip install --upgrade pip spin + spin setup-submodule pip install .[build] pip install .[doc] - run: diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 5e43e29b0..b05fb7047 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -147,7 +147,7 @@ jobs: - name: Build source distribution run: | - pip install spin + pip install spin build spin setup-submodule pip install .[build] build From b0f14ea86f75e8a4be3ef0011db0d8189e3b7da0 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Feb 2024 12:30:17 -0500 Subject: [PATCH 10/27] Try again Signed-off-by: Adam Li --- .circleci/config.yml | 2 +- .github/workflows/build_wheels.yml | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 13e071cd3..e4132be70 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -129,7 +129,7 @@ jobs: echo "After..." ls build-install/** export PYTHONPATH=$PWD/build-install/usr/lib/python3.8/site-packages - python ./spin docs + ./spin docs - store_artifacts: path: doc/_build/html diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index b05fb7047..1db9bae92 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -60,10 +60,10 @@ jobs: - name: Setup submodule run: | - pip install --upgrade pip - pip install spin - spin setup-submodule - pip install .[build] + python -m pip install --upgrade pip + python -m pip install spin + python -m spin setup-submodule + python -m pip install .[build] - name: win_amd64 - install rtools run: | @@ -150,7 +150,7 @@ jobs: pip install spin build spin setup-submodule pip install .[build] - build + python -m build - name: Store artifacts uses: actions/upload-artifact@v4 From 1dabd3ceeb071e5e28480b5efd2e13c864743f33 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Feb 2024 12:39:13 -0500 Subject: [PATCH 11/27] Rever gh actions Signed-off-by: Adam Li --- .github/workflows/main.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 432e78517..178d160d5 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -65,10 +65,10 @@ jobs: - name: Install Python packages run: | - pip install spin - spin setup-submodule - pip install .[build] - pip install .[test] + python -m pip install spin + python -m spin setup-submodule + python -m pip install .[build] + python -m pip install .[test] - name: Prepare compiler cache id: prep-ccache @@ -173,10 +173,10 @@ jobs: - name: Install Python packages run: | - pip install spin - spin setup-submodule - pip install .[build] - pip install .[test] + python -m pip install spin + python -m spin setup-submodule + python -m pip install .[build] + python -m pip install .[test] - name: Prepare compiler cache id: prep-ccache From 667a29e20a5ddaaf5123390fa25d487ded34c45c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Feb 2024 13:19:56 -0500 Subject: [PATCH 12/27] Try again Signed-off-by: Adam Li --- .github/workflows/main.yml | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 178d160d5..8b57bd2c6 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -48,20 +48,22 @@ jobs: cache: "pip" cache-dependency-path: "pyproject.toml" - - name: show-gcc + - name: Install packages for Ubuntu + if: ${{ matrix.os == 'ubuntu-22.04'}} run: | - gcc --version + sudo apt-get update + sudo apt-get install -y libopenblas-dev libatlas-base-dev liblapack-dev gfortran libgmp-dev libmpfr-dev libsuitesparse-dev ccache libmpc-dev + sudo apt-get install -y gcc - name: Install Ccache for MacOSX if: ${{ matrix.os == 'macos-latest'}} run: | brew install ccache + brew install gcc - - name: Install packages for Ubuntu - if: ${{ matrix.os == 'ubuntu-22.04'}} + - name: show-gcc run: | - sudo apt-get update - sudo apt-get install -y libopenblas-dev libatlas-base-dev liblapack-dev gfortran libgmp-dev libmpfr-dev libsuitesparse-dev ccache libmpc-dev + gcc --version - name: Install Python packages run: | From e3a8ac9df2bb6cc7395e1090100c0a75130bee2d Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Feb 2024 13:39:50 -0500 Subject: [PATCH 13/27] Fix Signed-off-by: Adam Li --- sktree/tree/_oblique_tree.pyx | 7 +++ .../unsupervised/_unsup_oblique_splitter.pxd | 8 ++-- .../unsupervised/_unsup_oblique_splitter.pyx | 29 +++++++----- test_tree.py | 47 ------------------- 4 files changed, 27 insertions(+), 64 deletions(-) delete mode 100644 test_tree.py diff --git a/sktree/tree/_oblique_tree.pyx b/sktree/tree/_oblique_tree.pyx index ee60364ad..8aeb1750f 100644 --- a/sktree/tree/_oblique_tree.pyx +++ b/sktree/tree/_oblique_tree.pyx @@ -136,8 +136,11 @@ cdef class ObliqueTree(Tree): d["nodes"] = self._get_node_ndarray() d["values"] = self._get_value_ndarray() + print("getting state") proj_vecs = self.get_projection_matrix() d["proj_vecs"] = proj_vecs + + print("got state...") return d def __setstate__(self, d): @@ -145,6 +148,7 @@ cdef class ObliqueTree(Tree): self.max_depth = d["max_depth"] self.node_count = d["node_count"] + print("settin gstate...") if "nodes" not in d: raise ValueError("You have loaded ObliqueTree version which " "cannot be imported") @@ -165,6 +169,7 @@ cdef class ObliqueTree(Tree): if self._resize_c(self.capacity) != 0: raise MemoryError("resizing tree to %d" % self.capacity) + print("about to set proj vec...") # now set the projection vector weights and indices proj_vecs = d["proj_vecs"] self.n_features = proj_vecs.shape[1] @@ -184,11 +189,13 @@ cdef class ObliqueTree(Tree): cpdef cnp.ndarray get_projection_matrix(self): """Get the projection matrix of shape (node_count, n_features).""" proj_vecs = np.zeros((self.node_count, self.n_features), dtype=np.float64) + print("geting projection matrix...") for i in range(0, self.node_count): for j in range(0, self.proj_vec_weights[i].size()): weight = self.proj_vec_weights[i][j] feat = self.proj_vec_indices[i][j] proj_vecs[i, feat] = weight + print("got projection matrix") return proj_vecs cdef int _resize_c(self, intp_t capacity=INTPTR_MAX) except -1 nogil: diff --git a/sktree/tree/unsupervised/_unsup_oblique_splitter.pxd b/sktree/tree/unsupervised/_unsup_oblique_splitter.pxd index 1e65a49b1..e8dce72ba 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_splitter.pxd +++ b/sktree/tree/unsupervised/_unsup_oblique_splitter.pxd @@ -20,8 +20,8 @@ cdef struct ObliqueSplitRecord: float64_t impurity_left # Impurity of the left split. float64_t impurity_right # Impurity of the right split. - vector[float32_t]* proj_vec_weights # weights of the vector (max_features,) - vector[intp_t]* proj_vec_indices # indices of the features (max_features,) + vector[float32_t] proj_vec_weights # weights of the vector (max_features,) + vector[intp_t] proj_vec_indices # indices of the features (max_features,) cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): @@ -73,6 +73,6 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): intp_t end, const intp_t[:] samples, float32_t[:] feature_values, - vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) - vector[intp_t]* proj_vec_indices # indices of the features (max_features,) + vector[float32_t]& proj_vec_weights, # weights of the vector (max_features,) + vector[intp_t]& proj_vec_indices # indices of the features (max_features,) ) noexcept nogil diff --git a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx index 53b2bbd43..6df258581 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx +++ b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx @@ -172,8 +172,8 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): intp_t end, const intp_t[:] samples, float32_t[:] feature_values, - vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) - vector[intp_t]* proj_vec_indices # indices of the features (max_features,) + vector[float32_t]& proj_vec_weights, # weights of the vector (max_features,) + vector[intp_t]& proj_vec_indices # indices of the features (max_features,) ) noexcept nogil: """Compute the feature values for the samples[start:end] range. @@ -187,8 +187,8 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): # Compute linear combination of features and then # sort samples according to the feature values. for jdx in range(0, proj_vec_indices.size()): - col_idx = deref(proj_vec_indices)[jdx] - col_weight = deref(proj_vec_weights)[jdx] + col_idx = proj_vec_indices[jdx] + col_weight = proj_vec_weights[jdx] for idx in range(start, end): # initialize the feature value to 0 @@ -294,8 +294,8 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): # XXX: 'feature' is not actually used in oblique split records because it normally indicates the column # Just indicates which split was sampled current_split.feature = feat_i - current_split.proj_vec_weights = &self.proj_mat_weights[feat_i] - current_split.proj_vec_indices = &self.proj_mat_indices[feat_i] + # current_split.proj_vec_weights = &self.proj_mat_weights[feat_i] + # current_split.proj_vec_indices = &self.proj_mat_indices[feat_i] # Compute linear combination of features and then # sort samples according to the feature values. @@ -304,8 +304,8 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): end, samples, feature_values, - &self.proj_mat_weights[feat_i], - &self.proj_mat_indices[feat_i] + self.proj_mat_weights[feat_i], + self.proj_mat_indices[feat_i] ) # Sort the samples @@ -362,9 +362,9 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): while p < partition_end: # Account for projection vector temp_d = 0.0 - for j in range(best_split.proj_vec_indices.size()): - temp_d += self.X[samples[p], deref(best_split.proj_vec_indices)[j]] *\ - deref(best_split.proj_vec_weights)[j] + for j in range(self.proj_mat_weights[best_split.feature].size()): + temp_d += self.X[samples[p], self.proj_mat_indices[best_split.feature][j]] *\ + self.proj_mat_weights[best_split.feature][j] if temp_d <= best_split.threshold: p += 1 @@ -381,9 +381,12 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): best_split.improvement = self.criterion.impurity_improvement( impurity, best_split.impurity_left, best_split.impurity_right) + # deref(oblique_split).proj_vec_weights = best_split.proj_vec_weights + # deref(oblique_split).proj_vec_indices = best_split.proj_vec_indices + deref(oblique_split).proj_vec_indices = self.proj_mat_indices[best_split.feature] + deref(oblique_split).proj_vec_weights = self.proj_mat_weights[best_split.feature] + # Return values - deref(oblique_split).proj_vec_indices = best_split.proj_vec_indices - deref(oblique_split).proj_vec_weights = best_split.proj_vec_weights deref(oblique_split).feature = best_split.feature deref(oblique_split).pos = best_split.pos deref(oblique_split).threshold = best_split.threshold diff --git a/test_tree.py b/test_tree.py deleted file mode 100644 index d79452236..000000000 --- a/test_tree.py +++ /dev/null @@ -1,47 +0,0 @@ -import numpy as np - -from sktree import ( - ExtraObliqueRandomForestClassifier, - HonestForestClassifier, - ObliqueRandomForestClassifier, -) -from sktree.tree import MultiViewDecisionTreeClassifier, ObliqueDecisionTreeClassifier - -"""Test regression reported in https://github.com/neurodata/scikit-tree/issues/215.""" -n, a = ( - 10000, - 20, -) -x = np.random.normal(size=(n, a)) -y = np.random.binomial(1, 0.5, size=(n)) - -for seed in range(1): - # est = MultiViewDecisionTreeClassifier( - # max_features=0.3, - # feature_set_ends=[15, 20], - # random_state=seed, - # ) - - # est = HonestForestClassifier( - # n_estimators=10, - # max_features=0.3, - # feature_set_ends=[15, 20], - # # bootstrap=True, - # # max_samples=1.6, - # tree_estimator=MultiViewDecisionTreeClassifier(), - # random_state=seed, - # n_jobs=-1, - # ) - - est = ExtraObliqueRandomForestClassifier( - n_estimators=10, - max_features=0.3, - # feature_set_ends=[15, 20], - bootstrap=True, - # max_samples=1.6, - # splitter='random', - random_state=seed, - n_jobs=-1, - ) - - est.fit(x, y) From b7e4afb4d7af05398a9d6f51c831b3870bc83c2c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Feb 2024 14:10:51 -0500 Subject: [PATCH 14/27] Remove prints Signed-off-by: Adam Li --- sktree/tree/_oblique_tree.pyx | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sktree/tree/_oblique_tree.pyx b/sktree/tree/_oblique_tree.pyx index 8aeb1750f..3e43b14cc 100644 --- a/sktree/tree/_oblique_tree.pyx +++ b/sktree/tree/_oblique_tree.pyx @@ -136,11 +136,9 @@ cdef class ObliqueTree(Tree): d["nodes"] = self._get_node_ndarray() d["values"] = self._get_value_ndarray() - print("getting state") proj_vecs = self.get_projection_matrix() d["proj_vecs"] = proj_vecs - print("got state...") return d def __setstate__(self, d): @@ -148,7 +146,6 @@ cdef class ObliqueTree(Tree): self.max_depth = d["max_depth"] self.node_count = d["node_count"] - print("settin gstate...") if "nodes" not in d: raise ValueError("You have loaded ObliqueTree version which " "cannot be imported") @@ -169,7 +166,6 @@ cdef class ObliqueTree(Tree): if self._resize_c(self.capacity) != 0: raise MemoryError("resizing tree to %d" % self.capacity) - print("about to set proj vec...") # now set the projection vector weights and indices proj_vecs = d["proj_vecs"] self.n_features = proj_vecs.shape[1] @@ -189,13 +185,11 @@ cdef class ObliqueTree(Tree): cpdef cnp.ndarray get_projection_matrix(self): """Get the projection matrix of shape (node_count, n_features).""" proj_vecs = np.zeros((self.node_count, self.n_features), dtype=np.float64) - print("geting projection matrix...") for i in range(0, self.node_count): for j in range(0, self.proj_vec_weights[i].size()): weight = self.proj_vec_weights[i][j] feat = self.proj_vec_indices[i][j] proj_vecs[i, feat] = weight - print("got projection matrix") return proj_vecs cdef int _resize_c(self, intp_t capacity=INTPTR_MAX) except -1 nogil: From 28120ebdc3a8cba03361491b048cd00251ed5c44 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Feb 2024 14:18:07 -0500 Subject: [PATCH 15/27] Try again Signed-off-by: Adam Li --- clf.joblib | Bin 0 -> 2203 bytes test_tree.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 clf.joblib create mode 100644 test_tree.py diff --git a/clf.joblib b/clf.joblib new file mode 100644 index 0000000000000000000000000000000000000000..098204d8dd2ca62c9808830db98b8c9a26fae4af GIT binary patch literal 2203 zcmdT_Pj3`A6yMFCW;cXoQ$iD|QrjCTg%v4?rY$8Kr6m%hY(y2P;9@*8p7p?tXRtj< zS863HH8e;*RH~z@mwpT`z3?sg5L`HNB7L6mZYHA~IKY89_&wY2{rvpi&u<=0>~Bug z@=wi)2a*%g$p_>`SO_At`t%kPSxWeJD2Tu5M&ZMZd{4YkgbCXMd;ORG{5SopUQ|tw zhmvqu*Qz0Rl9*ry01c5wp@a>sCR`$V{uyf98P2z~%jd7NMYI{=pFz@7zQP0vdaj#7&CdXckQP0b3Ka8jJHjiU?>d7@6n z2}@+cLJxUx;hs9>MVT5(KcUxGUfD9W*yw@? zBG*n!eOS&@oz%cagr_3fa1hLWPr$k=N&e+&g|>7CZD{?SyQ zHla4tCd&|b#-s^gvPil*ec#}@9hSxE0HvY#AOh6co|&$4jt4(+oC3uTsnigen`d7G zh%wV@iuu^sf{~gO6sJ(5@m0M_XjLk(=lZ(q5rpX~hsVJ<>U^Ahpc1^)b`SkrIl4^9K*w;bza4iX5agytcZv zd-m$jgMa>g+UeGj(0{p!oST}f=E}h8Dt<@(2Iv3`wW4Ag8_xln2=s2|LZPO`N z%k#BS(>G1)Ocl>Gd<5Mk literal 0 HcmV?d00001 diff --git a/test_tree.py b/test_tree.py new file mode 100644 index 000000000..9296ef8b2 --- /dev/null +++ b/test_tree.py @@ -0,0 +1,43 @@ +import joblib +import numpy as np + +from sktree.tree import UnsupervisedObliqueDecisionTree + +X_small = np.array( + [ + [0, 0, 4, 0, 0, 0, 1, -14, 0, -4, 0, 0, 0, 0], + [0, 0, 5, 3, 0, -4, 0, 0, 1, -5, 0.2, 0, 4, 1], + [-1, -1, 0, 0, -4.5, 0, 0, 2.1, 1, 0, 0, -4.5, 0, 1], + [-1, -1, 0, -1.2, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 1], + [-1, -1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1], + [-1, -2, 0, 4, -3, 10, 4, 0, -3.2, 0, 4, 3, -4, 1], + [2.11, 0, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0.5, 0, -3, 1], + [2.11, 0, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0, 0, -2, 1], + [2.11, 8, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0, 0, -2, 1], + [2.11, 8, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0.5, 0, -1, 0], + [2, 8, 5, 1, 0.5, -4, 10, 0, 1, -5, 3, 0, 2, 0], + [2, 0, 1, 1, 1, -1, 1, 0, 0, -2, 3, 0, 1, 0], + [2, 0, 1, 2, 3, -1, 10, 2, 0, -1, 1, 2, 2, 0], + [1, 1, 0, 2, 2, -1, 1, 2, 0, -5, 1, 2, 3, 0], + [3, 1, 0, 3, 0, -4, 10, 0, 1, -5, 3, 0, 3, 1], + [2.11, 8, -6, -0.5, 0, 1, 0, 0, -3.2, 6, 0.5, 0, -3, 1], + [2.11, 8, -6, -0.5, 0, 1, 0, 0, -3.2, 6, 1.5, 1, -1, -1], + [2.11, 8, -6, -0.5, 0, 10, 0, 0, -3.2, 6, 0.5, 0, -1, -1], + [2, 0, 5, 1, 0.5, -2, 10, 0, 1, -5, 3, 1, 0, -1], + [2, 0, 1, 1, 1, -2, 1, 0, 0, -2, 0, 0, 0, 1], + [2, 1, 1, 1, 2, -1, 10, 2, 0, -1, 0, 2, 1, 1], + [1, 1, 0, 0, 1, -3, 1, 2, 0, -5, 1, 2, 1, 1], + [3, 1, 0, 1, 0, -4, 1, 0, 1, -2, 0, 0, 1, 0], + ] +) + +y_small = [1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0] + +for i in range(10): + pickle_path = "./clf.joblib" + clf = UnsupervisedObliqueDecisionTree(random_state=i) + + clf.fit(X_small) + + joblib.dump(clf, pickle_path) + loaded_clf = joblib.load(pickle_path, mmap_mode="r") From fff9154bb0b0c2f94212bdcfdb498dd76f51c954 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Feb 2024 14:21:39 -0500 Subject: [PATCH 16/27] Try? Signed-off-by: Adam Li --- sktree/tree/unsupervised/_unsup_oblique_tree.pyx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx index f621b6985..6dc52cff5 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx +++ b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx @@ -110,6 +110,7 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): def __getstate__(self): """Getstate re-implementation, for pickling.""" d = {} + print("Setting state...") # capacity is inferred during the __setstate__ using nodes d["max_depth"] = self.max_depth d["node_count"] = self.node_count @@ -118,6 +119,7 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): proj_vecs = self.get_projection_matrix() d["proj_vecs"] = proj_vecs + print("Finished Setting state...") return d def __setstate__(self, d): @@ -128,7 +130,7 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): if "nodes" not in d: raise ValueError("You have loaded ObliqueTree version which " "cannot be imported") - + print("Getting state...") node_ndarray = d["nodes"] value_ndarray = d["values"] @@ -156,6 +158,7 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): self.proj_vec_weights[i].push_back(weight) self.proj_vec_indices[i].push_back(j) + print("Finsihed getting state...") memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), self.capacity * sizeof(Node)) memcpy(self.value, cnp.PyArray_DATA(value_ndarray), @@ -163,12 +166,14 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): cpdef cnp.ndarray get_projection_matrix(self): """Get the projection matrix of shape (node_count, n_features).""" + print("getting proj.") proj_vecs = np.zeros((self.node_count, self.n_features), dtype=np.float64) for i in range(0, self.node_count): for j in range(0, self.proj_vec_weights[i].size()): weight = self.proj_vec_weights[i][j] feat = self.proj_vec_indices[i][j] proj_vecs[i, feat] = weight + print("finished getting proj.") return proj_vecs cdef int _resize_c(self, intp_t capacity=SIZE_MAX) except -1 nogil: From 17d12f57ff7bb43fa03d87f054d71edc972650f3 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Feb 2024 14:35:34 -0500 Subject: [PATCH 17/27] Try? Signed-off-by: Adam Li --- sktree/tree/unsupervised/_unsup_oblique_tree.pxd | 2 +- test_tree.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/sktree/tree/unsupervised/_unsup_oblique_tree.pxd b/sktree/tree/unsupervised/_unsup_oblique_tree.pxd index 5292551b9..4af00f445 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_tree.pxd +++ b/sktree/tree/unsupervised/_unsup_oblique_tree.pxd @@ -32,7 +32,7 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): SplitRecord* split_node, Node *node, intp_t node_id, - ) nogil except -1 + ) except -1 nogil cdef float32_t _compute_feature( self, const float32_t[:, :] X_ndarray, diff --git a/test_tree.py b/test_tree.py index 9296ef8b2..be863601a 100644 --- a/test_tree.py +++ b/test_tree.py @@ -1,7 +1,7 @@ import joblib import numpy as np -from sktree.tree import UnsupervisedObliqueDecisionTree +from sktree.tree import ObliqueDecisionTreeClassifier, UnsupervisedObliqueDecisionTree X_small = np.array( [ @@ -35,9 +35,11 @@ for i in range(10): pickle_path = "./clf.joblib" - clf = UnsupervisedObliqueDecisionTree(random_state=i) + # clf = UnsupervisedObliqueDecisionTree(random_state=i) + # clf.fit(X_small) - clf.fit(X_small) + clf = ObliqueDecisionTreeClassifier(random_state=i) + clf.fit(X_small, y_small) joblib.dump(clf, pickle_path) loaded_clf = joblib.load(pickle_path, mmap_mode="r") From 7745458016dbf8075c44e03c6c200d6e1aa2d6a2 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Feb 2024 15:01:03 -0500 Subject: [PATCH 18/27] Try fix ci Signed-off-by: Adam Li --- .github/workflows/main.yml | 1 + clf.joblib | Bin 2203 -> 0 bytes .../unsupervised/_unsup_oblique_splitter.pyx | 1 + test_tree.py | 2 +- 4 files changed, 3 insertions(+), 1 deletion(-) delete mode 100644 clf.joblib diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 8b57bd2c6..adb093b0e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -54,6 +54,7 @@ jobs: sudo apt-get update sudo apt-get install -y libopenblas-dev libatlas-base-dev liblapack-dev gfortran libgmp-dev libmpfr-dev libsuitesparse-dev ccache libmpc-dev sudo apt-get install -y gcc + sudo apt-get update - name: Install Ccache for MacOSX if: ${{ matrix.os == 'macos-latest'}} diff --git a/clf.joblib b/clf.joblib deleted file mode 100644 index 098204d8dd2ca62c9808830db98b8c9a26fae4af..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2203 zcmdT_Pj3`A6yMFCW;cXoQ$iD|QrjCTg%v4?rY$8Kr6m%hY(y2P;9@*8p7p?tXRtj< zS863HH8e;*RH~z@mwpT`z3?sg5L`HNB7L6mZYHA~IKY89_&wY2{rvpi&u<=0>~Bug z@=wi)2a*%g$p_>`SO_At`t%kPSxWeJD2Tu5M&ZMZd{4YkgbCXMd;ORG{5SopUQ|tw zhmvqu*Qz0Rl9*ry01c5wp@a>sCR`$V{uyf98P2z~%jd7NMYI{=pFz@7zQP0vdaj#7&CdXckQP0b3Ka8jJHjiU?>d7@6n z2}@+cLJxUx;hs9>MVT5(KcUxGUfD9W*yw@? zBG*n!eOS&@oz%cagr_3fa1hLWPr$k=N&e+&g|>7CZD{?SyQ zHla4tCd&|b#-s^gvPil*ec#}@9hSxE0HvY#AOh6co|&$4jt4(+oC3uTsnigen`d7G zh%wV@iuu^sf{~gO6sJ(5@m0M_XjLk(=lZ(q5rpX~hsVJ<>U^Ahpc1^)b`SkrIl4^9K*w;bza4iX5agytcZv zd-m$jgMa>g+UeGj(0{p!oST}f=E}h8Dt<@(2Iv3`wW4Ag8_xln2=s2|LZPO`N z%k#BS(>G1)Ocl>Gd<5Mk diff --git a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx index 6df258581..0add5b1ad 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx +++ b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx @@ -146,6 +146,7 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): """ # call parent reset UnsupervisedSplitter.node_reset(self, start, end, weighted_n_node_samples) + cdef intp_t i # Clear all projection vectors for i in range(self.max_features): diff --git a/test_tree.py b/test_tree.py index be863601a..09160ab76 100644 --- a/test_tree.py +++ b/test_tree.py @@ -33,7 +33,7 @@ y_small = [1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0] -for i in range(10): +for i in range(100): pickle_path = "./clf.joblib" # clf = UnsupervisedObliqueDecisionTree(random_state=i) # clf.fit(X_small) From cec4b244928e4082a93e2ef9eaef7717fae57993 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Feb 2024 15:09:23 -0500 Subject: [PATCH 19/27] Try on sep machine Signed-off-by: Adam Li --- clf.joblib | Bin 0 -> 7049 bytes .../unsupervised/_unsup_oblique_splitter.pyx | 5 +++++ 2 files changed, 5 insertions(+) create mode 100644 clf.joblib diff --git a/clf.joblib b/clf.joblib new file mode 100644 index 0000000000000000000000000000000000000000..bd0b46bb233c80163849b39d3a70355e0dc5d653 GIT binary patch literal 7049 zcmdTJc|4TsdIp2+%UDWCmP9E^wr(R!I*q0&N_Aw+#tdU-%%Uh-Y|%&LBOy!6sk9-I zQ=*0H%C54cA|+I|R7c`|vk9m3JKfv;dOv-EygGCmY3c}O4kgXU%K7c`Ia2Rwd5l4hhjscKD*KkBe0D(%!^NzrGlRoJ* z4)O*>D*!Td9D~bYaye`Wg#*&UT;M=J)sF#XgJ4GI1~7whOe)czMg{?83=dt+V&Q{5 zSa>FLYFeb(TmoMn1Q2{F5Rjsi@H}29cL0J-!84&)h($tc28+bY72FY8lQ_W;ouVsR zLA+vAYcS?uG`I;gMvDXRl^_O#SvLVa{0}sY0}=okJdMWKi3PkhQFN+w*mNbV9Ycrp1F1~tB3Ga?jrvj8rQLlv9{kYp3_ zG(4{;DW;Cn!or4uQIN}k%GA2zR~r;uF(^+{s)iqZweKYg2fCm>3w!|(ybghqF?(9W z@Dc;H368f30bL9}SNLDFzfniV?+>Vs?9G;qzo1%=}J9oyDXAQx?qhNG3BqMl3;d_q*062h{S z5<&q`DECy|nw!mej?$P$x3ZTL;-dVD3vt9WK7D)E+`U8+ATPOegfyWPMFMbrxWv0m+*o#mRT~r-qSl- zrvfXYx)nhf>}U$_d5vmp;6H!jPdmnYdQbxR0F~JsbG$j`9SUieuWFZnMach6pTPbr zp&yU_q;I+(!n03;0aUWN5CLW}Sqwj15ShpZ{}d7h=zB}>viKGd1V12?s-`Ul%BG={ z$Y0SX8H9^(=bO3nWB7k+M`ZU$<$uhZ|9vjec|}m<@l0Gq7eP_HMEKvqzti`3@;|#T zvj6Y;{j+v|R(__xGx49*|KF1TU-j$1b-w?9J;{%SXW$Y5X#y9zfs-Xz~_LBW|L3oF}ON z*G8v~@-Dnl9-3g_GhR#In_!e?-Va3gh~($ObMr>AP>_IDK*PUH*+{`SCz#I7CdXP8 z98N&u&+9lMG-X4M08;DS&24O!S%H)Y0Exp9s2rRxnNDWmp?wjPz>F~kIMKWQP zR;Z-whfcpFWqQ!4wqnvXY*=97*Ay)Ix_V*g^2@h=cjzsy_%hEAhP`Uxd|3?fO8Na^ zadK`|OiRzzBERC}jWN5oXvg66j!^g8bZ)HKho3*Bd!e~6+oOG1wOhXKoQlE2^G{f> zvxb+3db;=ayJnK_4a?nUn6N7EOU<*-Ni>4Tr!gWH6CRHQo1R=&a>}bH-!N6uWOyJG zRgb(9KO}B>Z0(uS3iTc4xrm3kv9U`=s#C+`@>9*bP*VM-<2P#i7qyTxx=!KeSuoxu zq*7zDM-%^0ZR8BYf)8m#B%F-#OxO_pfEb}_X)U&*(&okim!Xcbt-GDeyU#rs=^ozU zec|G2tF1o0_G&QGI%JfX?Jdpo4H5EJ9}_DN84X=5Z(@4xkIJx*uXi(ez3Hs%U!kcUA8g^80QhYvGl_Y!Gt_n1j$*xZL=(?pSHuXi3ZOX!4*;-VQV5#QvbX6Ta(x+ZxgVd#y4G1YM zbx=R-o%EaRH+iH%MRX5sG&>aJT&-^5 zJm>wB18LY$tm>_-ajW;+uWtCTXCi5=DzR&!l1lpZU8un{yLN1qol|~xHdrk1sv@+Uy_C%HZdtDX*jA})_l>a%bqDt6$3BdkTKB2a zw+B2WKi5@eMQt0fPR;1axSUsY*_~v$Va&$drAuMleIH%cyUBBh?c<(B4lgza<`s4a zlabYyoux+#h=T>BoodI!EADVEsD0_GD#;kA97oo*hVFW^gmX+0S9R)EP2(Pgv?tDo z0&vWbFP|f1q9Z&EhkBz8V#B)+Z``j?R#)SZf4u(gW7g(?XYp-KGRbtK^yuq*?!!}q zucU96J6Wy&IcwBtP-m~!-X{tkExNW%uMCtw@6dTV=d`$^XNyadLwp#w)IzLAlQn9% zbF?E((ewIJljxB{d(Xqf%LC_J72E9Y^Qdv~9~b}Grr1f#KBK8SA%CFfY{5yt$Xm;L z&b@6`>r!00#`V`e(8JyCoeFCx{&3<^edIj1oqICpX0X)=9F$*8|Du7zP8@np3ywq1m9>nul>{SM#o4u^t9K-cNZ)fFKPa` z#GV`!H%F%YH7HGzyOHSLqxgoq6!S`FOC9>r#nQBc7xDe5hc#Y!S@{oZ4I)2*cKb>{ z+dH{c9``%4loa@Wp8m4)$>gYCv(gm&js$v`jTx`_rFd_6VE7=d!RF$GN2s%Do~~(- z)yt1w=?9Bf#oXqOV%vD<7|Fr%fR{N?oOHu;VbSR zTrM@&eeHtj$vJFbAtN)S;V!&>O~Ci(balS5i%Bu z=gbex8h(4bQZ=Sw-QDL`J+Hp5LLJuGs$+Nbj>mRS*Lhj&TS`f#;wNW?dsN%beE$K%)OH=;*ZI+z}pfVwkRo< zIy}`pFIgYM3^1tf) zQ~UKwhhSoVkoPc0|B?F^`@sgAXioc-FI#pQeco@^aHL5#=JwraBl@-Ub~W=`0h#p* z3amx>&+_!+_O^da4)n?nJW2IFt#>+gQH@51r{CKIH^+k8>dnDbc|DsySQ@()m908< z^UDg)am$h1JGrIWtdc-UEz34X_`PPYp%mj`7p-vid$*{ptsNcqDh2_;2n}(PO#QvX zLk;(`;&qL0$W8?O`el}M4oyw#E^3!$c;S{Ewua(iYQYPWY@=xpott#xN60c7hoPVI z{q<^(99e_5pU?yvsQh!rGM|>nCxx^`*CIzJ?qGw@2+7AFEsA5(N^+iBwlgVkotPJ?ujAt%KDzUwT>}S zo#w3>>{n*Pu~p&eUX5)!xSIw8HH4?Y!?jMNlJynOAL$jOSz&L_o;a~e>Tcoq=eHPEX4?=Mx|=Y)zFUC@|SDr25! zzN4yMZxw1uhvb8*-kkQQ|6G^5!2H)PIfZ;_k{9Katx6vjX~ihi%YET}peS~BCG(BG zvQpjEy%@z$@68Gih0o8`9|%2hZQrNrWeeNXdxo;iV!PM&*nUX%7T#L{aV7)$%L2{? xW>N5LoG+fshEN!Q5UeKevRndY@{awiI{{qlb8y^4w literal 0 HcmV?d00001 diff --git a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx index 0add5b1ad..51cd8eb4a 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx +++ b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx @@ -283,6 +283,9 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): # instantiate the split records _init_split(&best_split, end) + with gil: + print("Splitting...") + # Sample the projection matrix self.sample_proj_mat(self.proj_mat_weights, self.proj_mat_indices) @@ -355,6 +358,8 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): best_split = current_split # copy + with gil: + print("Trying to reorg...") # Reorganize into samples[start:best_split.pos] + samples[best_split.pos:end] if best_split.pos < end: partition_end = end From 7ff2ead179a4d29e0a78d783af67454318e736f2 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Feb 2024 15:51:54 -0500 Subject: [PATCH 20/27] Try again Signed-off-by: Adam Li --- sktree/tree/unsupervised/_unsup_oblique_tree.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx index 6dc52cff5..2fd5ad0a7 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx +++ b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx @@ -225,6 +225,9 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): # https://www.codementor.io/@arpitbhayani/powering-inheritance-in-c-using-structure-composition-176sygr724 cdef ObliqueSplitRecord* oblique_split_node = (split_node) node_id = self.node_count + with gil: + print("Trying to set split nodes...") + node.feature = deref(oblique_split_node).feature node.threshold = deref(oblique_split_node).threshold @@ -232,6 +235,9 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): # inside the tree itself self.proj_vec_weights[node_id] = deref(oblique_split_node).proj_vec_weights self.proj_vec_indices[node_id] = deref(oblique_split_node).proj_vec_indices + + with gil: + print("Finished setting for ", node_id) return 1 cdef float32_t _compute_feature( From 031d653422148b1968e40f338ce131bd3d389eaa Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Feb 2024 16:03:25 -0500 Subject: [PATCH 21/27] Try again Signed-off-by: Adam Li --- sktree/tree/unsupervised/_unsup_oblique_splitter.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx index 51cd8eb4a..bd1e8c07c 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx +++ b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx @@ -389,9 +389,10 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): # deref(oblique_split).proj_vec_weights = best_split.proj_vec_weights # deref(oblique_split).proj_vec_indices = best_split.proj_vec_indices + print("About to set weights") deref(oblique_split).proj_vec_indices = self.proj_mat_indices[best_split.feature] deref(oblique_split).proj_vec_weights = self.proj_mat_weights[best_split.feature] - + print("Finished setting everything") # Return values deref(oblique_split).feature = best_split.feature deref(oblique_split).pos = best_split.pos From 600d4d1013a50a9e05d91b7eb54081ea80ef101a Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Feb 2024 16:13:28 -0500 Subject: [PATCH 22/27] Try again Signed-off-by: Adam Li --- sktree/tree/unsupervised/_unsup_oblique_splitter.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx index bd1e8c07c..ace0dfd95 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx +++ b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx @@ -389,10 +389,12 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): # deref(oblique_split).proj_vec_weights = best_split.proj_vec_weights # deref(oblique_split).proj_vec_indices = best_split.proj_vec_indices - print("About to set weights") + with gil: + print("About to set weights") deref(oblique_split).proj_vec_indices = self.proj_mat_indices[best_split.feature] deref(oblique_split).proj_vec_weights = self.proj_mat_weights[best_split.feature] - print("Finished setting everything") + with gil: + print("Finished setting everything") # Return values deref(oblique_split).feature = best_split.feature deref(oblique_split).pos = best_split.pos From 657b641a2940dcc71a9ea5cff575a5a264f5d362 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Feb 2024 16:21:20 -0500 Subject: [PATCH 23/27] Try again Signed-off-by: Adam Li --- sktree/tree/unsupervised/_unsup_oblique_splitter.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx index ace0dfd95..869d199a5 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx +++ b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx @@ -391,6 +391,8 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): # deref(oblique_split).proj_vec_indices = best_split.proj_vec_indices with gil: print("About to set weights") + print(best_split.feature) + print(self.proj_mat_weights.size(), self.proj_mat_indices.size()) deref(oblique_split).proj_vec_indices = self.proj_mat_indices[best_split.feature] deref(oblique_split).proj_vec_weights = self.proj_mat_weights[best_split.feature] with gil: From 261b1f9bbd5df11766b268a42d3da752aed9bc4b Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Feb 2024 16:27:57 -0500 Subject: [PATCH 24/27] Try again Signed-off-by: Adam Li --- sktree/tree/unsupervised/_unsup_oblique_splitter.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx index 869d199a5..8f8771b0e 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx +++ b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx @@ -393,6 +393,9 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): print("About to set weights") print(best_split.feature) print(self.proj_mat_weights.size(), self.proj_mat_indices.size()) + with gil: + print(deref(oblique_split).proj_vec_indices.size(), deref(oblique_split).proj_vec_weights.size()) + deref(oblique_split).proj_vec_indices = self.proj_mat_indices[best_split.feature] deref(oblique_split).proj_vec_weights = self.proj_mat_weights[best_split.feature] with gil: From 74d31bcef471505cbbf34886e2e602f24d51b7ef Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 29 Feb 2024 16:33:03 -0500 Subject: [PATCH 25/27] Try again Signed-off-by: Adam Li --- sktree/tree/unsupervised/_unsup_oblique_splitter.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx index 8f8771b0e..ce0417849 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx +++ b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx @@ -393,6 +393,7 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): print("About to set weights") print(best_split.feature) print(self.proj_mat_weights.size(), self.proj_mat_indices.size()) + print(self.proj_mat_indices[best_split.feature].size(), self.proj_mat_weights[best_split.feature].size()) with gil: print(deref(oblique_split).proj_vec_indices.size(), deref(oblique_split).proj_vec_weights.size()) From ec503b4e5a9be23a8f369a163117960df0b555d9 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 5 Mar 2024 16:59:51 -0500 Subject: [PATCH 26/27] Loose file Signed-off-by: Adam Li --- sktree/tree/_projection.pxd | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 sktree/tree/_projection.pxd diff --git a/sktree/tree/_projection.pxd b/sktree/tree/_projection.pxd new file mode 100644 index 000000000..de171cb20 --- /dev/null +++ b/sktree/tree/_projection.pxd @@ -0,0 +1,8 @@ +from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int32_t, intp_t, uint32_t + + +cpdef sample_projection_matrix(intp_t max_features, ): + # Extract input + cdef float32_t[:] X_data = X.data + cdef int32_t[:] X_indices = X.indices + cdef int32_t[:] X_indptr = X.indptr From 2af16b418b06cb3775a332b6baaba5b8a4ba6fa2 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 6 Mar 2024 15:37:52 -0500 Subject: [PATCH 27/27] Update numpy Signed-off-by: Adam Li --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6d684edff..c7a450cba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,7 @@ include = [ {path = "meson.build", format = "sdist"} ] dependencies = [ - 'numpy', + 'numpy>=1.25.0', 'scipy>=1.5.0', 'scikit-learn>=1.4.1' ] @@ -71,7 +71,7 @@ build = [ 'scikit-learn>=1.4.1', 'Cython>=3.0.8', 'ninja', - 'numpy', + 'numpy>=1.25.0', 'rich-click', 'pydevtool' ]