diff --git a/.circleci/config.yml b/.circleci/config.yml index acba98a00..e4132be70 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -81,12 +81,14 @@ jobs: - run: name: setup Python venv command: | - pip install -r build_requirements.txt - pip install -r doc_requirements.txt + pip install --upgrade pip spin + spin setup-submodule + pip install .[build] + pip install .[doc] - run: name: build scikit-tree command: | - python spin build -j2 + spin build -j2 - save_cache: key: deps_ccache-{{ .Branch }} @@ -127,7 +129,7 @@ jobs: echo "After..." ls build-install/** export PYTHONPATH=$PWD/build-install/usr/lib/python3.8/site-packages - python ./spin docs + ./spin docs - store_artifacts: path: doc/_build/html diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 45db9bf20..1db9bae92 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -60,9 +60,10 @@ jobs: - name: Setup submodule run: | - python -m pip install -r build_requirements.txt + python -m pip install --upgrade pip python -m pip install spin - python spin setup-submodule + python -m spin setup-submodule + python -m pip install .[build] - name: win_amd64 - install rtools run: | @@ -79,7 +80,7 @@ jobs: # configuration step to create a build directory. The subsequent wheel # build then needs to use that directory. This can be done with pip # using a command like: - # python -m pip wheel --config-settings builddir=build . + # pip wheel --config-settings builddir=build . # if: >- # ( ! contains(matrix.os[2], 'arm64' ) ) env: @@ -114,10 +115,10 @@ jobs: - name: Install cibuildwheel dependencies run: | - python -m pip install cibuildwheel - python -m pip install -r build_requirements.txt - python -m pip install spin - python spin setup-submodule + pip install cibuildwheel + pip install spin + spin setup-submodule + pip install .[build] - name: Build wheels uses: pypa/cibuildwheel@v2.16.5 @@ -146,9 +147,9 @@ jobs: - name: Build source distribution run: | - python -m pip install -r build_requirements.txt - python -m pip install spin - python spin setup-submodule + pip install spin build + spin setup-submodule + pip install .[build] python -m build - name: Store artifacts diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 80bcb739e..adb093b0e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -46,28 +46,32 @@ jobs: python-version: ${{ matrix.python-version }} architecture: "x64" cache: "pip" - cache-dependency-path: "requirements.txt" + cache-dependency-path: "pyproject.toml" - - name: show-gcc + - name: Install packages for Ubuntu + if: ${{ matrix.os == 'ubuntu-22.04'}} run: | - gcc --version + sudo apt-get update + sudo apt-get install -y libopenblas-dev libatlas-base-dev liblapack-dev gfortran libgmp-dev libmpfr-dev libsuitesparse-dev ccache libmpc-dev + sudo apt-get install -y gcc + sudo apt-get update - name: Install Ccache for MacOSX if: ${{ matrix.os == 'macos-latest'}} run: | brew install ccache + brew install gcc - - name: Install packages for Ubuntu - if: ${{ matrix.os == 'ubuntu-22.04'}} + - name: show-gcc run: | - sudo apt-get update - sudo apt-get install -y libopenblas-dev libatlas-base-dev liblapack-dev gfortran libgmp-dev libmpfr-dev libsuitesparse-dev ccache libmpc-dev + gcc --version - name: Install Python packages run: | - python -m pip install -r build_requirements.txt python -m pip install spin - python -m pip install -r test_requirements.txt + python -m spin setup-submodule + python -m pip 
install .[build] + python -m pip install .[test] - name: Prepare compiler cache id: prep-ccache @@ -153,7 +157,7 @@ jobs: python-version: ${{ matrix.python-version }} architecture: "x64" cache: "pip" - cache-dependency-path: "requirements.txt" + cache-dependency-path: "pyproject.toml" - name: show-gcc run: | @@ -172,9 +176,10 @@ jobs: - name: Install Python packages run: | - python -m pip install -r build_requirements.txt python -m pip install spin - python -m pip install -r test_requirements.txt + python -m spin setup-submodule + python -m pip install .[build] + python -m pip install .[test] - name: Prepare compiler cache id: prep-ccache @@ -232,7 +237,7 @@ jobs: - name: Upload coverage stats to codecov uses: codecov/codecov-action@v4 with: - # python spin goes into the INSTALLED path in order to run pytest + # spin goes into the INSTALLED path in order to run pytest files: ./coverage.xml fail_ci_if_error: true verbose: true @@ -258,7 +263,7 @@ jobs: # python-version: "3.10" # architecture: "x64" # cache: "pip" -# cache-dependency-path: "requirements.txt" +# cache-dependency-path: "pyproject.toml" # - name: install-rtools # run: | @@ -287,8 +292,8 @@ jobs: # - name: pip-packages # run: | -# pip install -r build_requirements.txt -# pip install -r test_requirements.txt +# pip install .[build] +# pip install .[test] # pip install spin # pip install numpy==1.22.4 @@ -319,8 +324,8 @@ jobs: # run: | # echo "SCIPY_USE_PROPACK=1" >> $env:GITHUB_ENV # echo "FORCE_SUBMODULE=True" >> $env:GITHUB_ENV -# python spin setup_submodule --forcesubmodule -# python spin build -j 2 +# spin setup_submodule --forcesubmodule +# spin build -j 2 # # Necessary because GitHub Actions checks out the repo to D:\ while OpenBLAS # # got installed to C:\ higher up. The copying with `--win-cp-openblas` fails # # when things are split over drives. @@ -329,4 +334,4 @@ jobs: # - name: test # run: | -# python spin test +# spin test diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 60ef6fe0f..fd574cde2 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -32,8 +32,8 @@ jobs: architecture: "x64" - name: Install dependencies run: | - python -m pip install --progress-bar off --upgrade pip - python -m pip install --progress-bar off build twine + pip install --progress-bar off --upgrade pip + pip install --progress-bar off build twine - name: Prepare environment run: | echo "RELEASE_VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_ENV diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index 30173a454..b9369698d 100644 --- a/.github/workflows/style.yml +++ b/.github/workflows/style.yml @@ -41,7 +41,10 @@ jobs: sudo apt-get install -y libopenblas-dev libatlas-base-dev liblapack-dev - name: Install dependencies - run: pip install -r style_requirements.txt + run: | + pip install --upgrade pip spin + spin setup-submodule + pip install .[style] # check formatting of the code style - name: Check code formatting diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 73c6c70d5..6278bbdb4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -93,7 +93,7 @@ When you're ready to contribute code to address an open issue, please follow the Once your virtual environment is activated, you can install your local clone in "editable mode" with - pip install -r build_requirements.txt + pip install .[build] pip install -e . 
The "editable mode" comes from the `-e` argument to `pip`, and essential just creates a symbolic link from the site-packages directory of your virtual environment to the source code in your local clone. That way any changes you make will be immediately reflected in your virtual environment. diff --git a/DEVELOPING.md b/DEVELOPING.md index 3f2e645fe..6103f340e 100644 --- a/DEVELOPING.md +++ b/DEVELOPING.md @@ -39,14 +39,14 @@ Assuming these steps have worked properly and you have read and followed any nec If you are developing locally, you will need the build dependencies to compile the Cython / C++ code: - pip install -r build_requirements.txt + pip install .[build] Other requirements can be installed as such: - pip install -r requirements.txt - pip install -r style_requirements.txt - pip install -r test_requirements.txt - pip install -r doc_requirements.txt + pip install . + pip install .[style] + pip install .[test] + pip install .[doc] # Building the project from source diff --git a/README.md b/README.md index e8c767478..68e2ce946 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ Building locally with Meson (For developers) Make sure you have the necessary packages installed # install build dependencies - pip install -r build_requirements.txt + pip install .[build] # you may need these optional dependencies to build scikit-learn locally conda install -c conda-forge joblib threadpoolctl pytest compilers llvm-openmp diff --git a/clf.joblib b/clf.joblib new file mode 100644 index 000000000..bd0b46bb2 Binary files /dev/null and b/clf.joblib differ diff --git a/doc/install.rst b/doc/install.rst index 67edeeb23..099693b85 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -41,7 +41,7 @@ Then run installation of build packages .. code-block:: bash - pip install -r build_requirements.txt + pip install .[build] pip install spin # use spin CLI to run Meson build locally diff --git a/pyproject.toml b/pyproject.toml index ed0e197d9..c7a450cba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [build-system] build-backend = "mesonpy" requires = [ - "meson-python>=0.13.0", + "meson-python>=0.15.0", 'ninja', # `wheel` is needed for non-isolated builds, given that `meson-python` # doesn't list it as a runtime requirement (at least in 0.10.0) @@ -9,8 +9,8 @@ requires = [ "wheel", "setuptools<=65.5", "packaging", - "Cython>=0.29.24", - "scikit-learn>=1.4", + "Cython>=3.0.8", + "scikit-learn>=1.4.1", "scipy>=1.5.0", "numpy>=1.25; python_version>='3.9'" ] @@ -49,9 +49,9 @@ include = [ {path = "meson.build", format = "sdist"} ] dependencies = [ - 'numpy', + 'numpy>=1.25.0', 'scipy>=1.5.0', - 'scikit-learn>=1.3.1' + 'scikit-learn>=1.4.1' ] [project.optional-dependencies] @@ -68,10 +68,10 @@ build = [ 'meson-python', 'spin', 'doit', - 'scikit-learn>=1.3.1', - 'Cython>=0.29.36', + 'scikit-learn>=1.4.1', + 'Cython>=3.0.8', 'ninja', - 'numpy', + 'numpy>=1.25.0', 'rich-click', 'pydevtool' ] diff --git a/sktree/_lib/sklearn_fork b/sktree/_lib/sklearn_fork index 94517a38a..db5b137c1 160000 --- a/sktree/_lib/sklearn_fork +++ b/sktree/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit 94517a38a6354ee02ef715d1077c8ec6d1713d3b +Subproject commit db5b137c1c1d2cb90aed2354dcb5b933e1df803b diff --git a/sktree/tree/_oblique_splitter.pxd b/sktree/tree/_oblique_splitter.pxd index 15feaea22..16678a492 100644 --- a/sktree/tree/_oblique_splitter.pxd +++ b/sktree/tree/_oblique_splitter.pxd @@ -31,8 +31,12 @@ cdef struct ObliqueSplitRecord: float64_t impurity_left # Impurity of the left split. 
float64_t impurity_right # Impurity of the right split. - vector[float32_t]* proj_vec_weights # weights of the vector (max_features,) - vector[intp_t]* proj_vec_indices # indices of the features (max_features,) + # XXX: By storing the values, they will have to be copied into the split record + # which may be inefficient. + # Inside the tree, they will also again be copied into a Node, which will + # in total incur two copies, whereas we ideally only need one copy. + vector[float32_t] proj_vec_weights # weights of the vector (max_features,) + vector[intp_t] proj_vec_indices # indices of the features (max_features,) cdef class BaseObliqueSplitter(Splitter): @@ -43,7 +47,7 @@ cdef class BaseObliqueSplitter(Splitter): cdef vector[vector[float32_t]] proj_mat_weights # nonzero weights of sparse proj_mat matrix cdef vector[vector[intp_t]] proj_mat_indices # nonzero indices of sparse proj_mat matrix - # TODO: assumes all oblique splitters only work with dense data + # TODO: assumes all oblique splitters only work with dense data; make work with sparse arrays too cdef const float32_t[:, :] X # feature weights across (n_dims,) @@ -72,8 +76,8 @@ cdef class BaseObliqueSplitter(Splitter): intp_t end, const intp_t[:] samples, float32_t[:] feature_values, - vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) - vector[intp_t]* proj_vec_indices # indices of the features (max_features,) + vector[float32_t]& proj_vec_weights, # weights of the vector (max_features,) + vector[intp_t]& proj_vec_indices # indices of the features (max_features,) ) noexcept nogil cdef int node_split( diff --git a/sktree/tree/_oblique_splitter.pyx b/sktree/tree/_oblique_splitter.pyx index aee43b1ce..264deea36 100644 --- a/sktree/tree/_oblique_splitter.pyx +++ b/sktree/tree/_oblique_splitter.pyx @@ -100,8 +100,8 @@ cdef class BaseObliqueSplitter(Splitter): intp_t end, const intp_t[:] samples, float32_t[:] feature_values, - vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) - vector[intp_t]* proj_vec_indices # indices of the features (max_features,) + vector[float32_t]& proj_vec_weights, # weights of the vector (max_features,) + vector[intp_t]& proj_vec_indices # indices of the features (max_features,) ) noexcept nogil: """Compute the feature values for the samples[start:end] range. @@ -115,8 +115,8 @@ cdef class BaseObliqueSplitter(Splitter): # Compute linear combination of features and then # sort samples according to the feature values. 
for jdx in range(0, proj_vec_indices.size()): - col_idx = deref(proj_vec_indices)[jdx] - col_weight = deref(proj_vec_weights)[jdx] + col_idx = proj_vec_indices[jdx] + col_weight = proj_vec_weights[jdx] for idx in range(start, end): # initialize the feature value to 0 @@ -322,12 +322,13 @@ cdef class BestObliqueSplitter(ObliqueSplitter): cdef intp_t feat_i, p # index over computed features and start/end cdef intp_t partition_end - cdef float32_t temp_d # to compute a projection feature value + cdef float32_t temp_d # to compute a projection feature value # instantiate the split records _init_split(&best_split, end) - # Sample the projection matrix + # Sample the projection matrix by passing in a reference to the underlying + # vector of vectors for weights and indices self.sample_proj_mat(self.proj_mat_weights, self.proj_mat_indices) # For every vector in the projection matrix @@ -336,11 +337,9 @@ if self.proj_mat_weights[feat_i].empty(): continue - # XXX: 'feature' is not actually used in oblique split records - # Just indicates which split was sampled + # Note: 'feature' is not actually used in oblique split records + # Just indicates which index was sampled in the sampled projection matrix current_split.feature = feat_i - current_split.proj_vec_weights = &self.proj_mat_weights[feat_i] - current_split.proj_vec_indices = &self.proj_mat_indices[feat_i] # Compute linear combination of features and then # sort samples according to the feature values. @@ -349,8 +348,8 @@ end, samples, feature_values, - &self.proj_mat_weights[feat_i], - &self.proj_mat_indices[feat_i] + self.proj_mat_weights[feat_i], + self.proj_mat_indices[feat_i] ) # Sort the samples @@ -395,6 +394,14 @@ best_split = current_split # copy + # Note: we do not copy the projection vectors into the split record above; + # the best projection vector is instead recovered later from + # self.proj_mat_weights and self.proj_mat_indices via best_split.feature + # best_split.proj_vec_weights = self.proj_mat_weights[feat_i] + # best_split.proj_vec_indices = self.proj_mat_indices[feat_i] + # best_proj_vec_weights = &self.proj_mat_weights[feat_i] + # best_proj_vec_indices = &self.proj_mat_indices[feat_i] + # Reorganize into samples[start:best_split.pos] + samples[best_split.pos:end] if best_split.pos < end: partition_end = end @@ -403,9 +410,12 @@ while p < partition_end: # Account for projection vector temp_d = 0.0 - for j in range(best_split.proj_vec_indices.size()): - temp_d += self.X[samples[p], deref(best_split.proj_vec_indices)[j]] *\ - deref(best_split.proj_vec_weights)[j] + # for j in range(best_split.proj_vec_indices.size()): + # # temp_d += self.X[samples[p], best_split.proj_vec_indices[j]] *\ + # # best_split.proj_vec_weights[j] + for j in range(self.proj_mat_weights[best_split.feature].size()): + temp_d += self.X[samples[p], self.proj_mat_indices[best_split.feature][j]] *\ + self.proj_mat_weights[best_split.feature][j] if temp_d <= best_split.threshold: p += 1 @@ -422,9 +432,14 @@ best_split.improvement = self.criterion.impurity_improvement( impurity, best_split.impurity_left, best_split.impurity_right) - # Return values - deref(oblique_split).proj_vec_indices = best_split.proj_vec_indices - deref(oblique_split).proj_vec_weights = best_split.proj_vec_weights + # Ensure that the projection vectors are copied into the underlying split
record that is + # seen by the tree builder + # deref(oblique_split).proj_vec_indices = deref(best_proj_vec_indices) # best_split.proj_vec_indices + # deref(oblique_split).proj_vec_weights = deref(best_proj_vec_weights) # best_split.proj_vec_weights + deref(oblique_split).proj_vec_indices = self.proj_mat_indices[best_split.feature] + deref(oblique_split).proj_vec_weights = self.proj_mat_weights[best_split.feature] + + # Dereference the pointer to the split record and set the values here deref(oblique_split).feature = best_split.feature deref(oblique_split).pos = best_split.pos deref(oblique_split).threshold = best_split.threshold @@ -560,11 +575,9 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): if self.proj_mat_weights[feat_i].empty(): continue - # XXX: 'feature' is not actually used in oblique split records - # Just indicates which split was sampled + # Note: 'feature' is not actually used in oblique split records + # Just indicates which index was sampled in the sampled projection matrix current_split.feature = feat_i - current_split.proj_vec_weights = &self.proj_mat_weights[feat_i] - current_split.proj_vec_indices = &self.proj_mat_indices[feat_i] # Compute linear combination of features self.compute_features_over_samples( @@ -572,8 +585,8 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): end, samples, feature_values, - &self.proj_mat_weights[feat_i], - &self.proj_mat_indices[feat_i] + self.proj_mat_weights[feat_i], + self.proj_mat_indices[feat_i] ) # find min, max of the feature_values @@ -618,6 +631,15 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): best_proxy_improvement = current_proxy_improvement best_split = current_split # copy + # Note: we do not make a copy above if we are not going to use it + # as the candidate best split + # The self.proj_mat_weights and self.proj_mat_indices already contain + # the best projection vector found at `best_split.feature` (i.e. feat_i). 
+ # best_split.proj_vec_weights = self.proj_mat_weights[feat_i] + # best_split.proj_vec_indices = self.proj_mat_indices[feat_i] + # best_proj_vec_indices = &self.proj_mat_indices[feat_i] + # best_proj_vec_weights = &self.proj_mat_weights[feat_i] + n_visited_features += 1 # Reorganize into samples[start:best_split.pos] + samples[best_split.pos:end] @@ -628,9 +650,12 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): while p < partition_end: # Account for projection vector temp_d = 0.0 - for j in range(best_split.proj_vec_indices.size()): - temp_d += self.X[samples[p], deref(best_split.proj_vec_indices)[j]] *\ - deref(best_split.proj_vec_weights)[j] + # for j in range(best_split.proj_vec_indices.size()): + # temp_d += self.X[samples[p], best_split.proj_vec_indices[j]] *\ + # best_split.proj_vec_weights[j] + for j in range(self.proj_mat_indices[best_split.feature].size()): + temp_d += self.X[samples[p], self.proj_mat_indices[best_split.feature][j]] *\ + self.proj_mat_weights[best_split.feature][j] if temp_d <= best_split.threshold: p += 1 @@ -647,9 +672,12 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): best_split.improvement = self.criterion.impurity_improvement( impurity, best_split.impurity_left, best_split.impurity_right) + # deref(oblique_split).proj_vec_indices = deref(best_proj_vec_indices) # best_split.proj_vec_indices + # deref(oblique_split).proj_vec_weights = deref(best_proj_vec_weights) # best_split.proj_vec_weights + deref(oblique_split).proj_vec_indices = self.proj_mat_indices[best_split.feature] + deref(oblique_split).proj_vec_weights = self.proj_mat_weights[best_split.feature] + # Return values - deref(oblique_split).proj_vec_indices = best_split.proj_vec_indices - deref(oblique_split).proj_vec_weights = best_split.proj_vec_weights deref(oblique_split).feature = best_split.feature deref(oblique_split).pos = best_split.pos deref(oblique_split).threshold = best_split.threshold @@ -830,186 +858,6 @@ cdef class MultiViewSplitter(BestObliqueSplitter): if proj_i >= self.max_features: break -# XXX: not used right now -cdef class MultiViewObliqueSplitter(BestObliqueSplitter): - def __cinit__( - self, - Criterion criterion, - intp_t max_features, - intp_t min_samples_leaf, - float64_t min_weight_leaf, - object random_state, - const cnp.int8_t[:] monotonic_cst, - float64_t feature_combinations, - const intp_t[:] feature_set_ends, - intp_t n_feature_sets, - bint uniform_sampling, - *argv - ): - self.feature_set_ends = feature_set_ends - self.uniform_sampling = uniform_sampling - - # infer the number of feature sets - self.n_feature_sets = n_feature_sets - - def __reduce__(self): - """Enable pickling the splitter.""" - return (type(self), - ( - self.criterion, - self.max_features, - self.min_samples_leaf, - self.min_weight_leaf, - self.random_state, - self.monotonic_cst.base if self.monotonic_cst is not None else None, - self.feature_combinations, - self.feature_set_ends, - self.n_feature_sets, - self.uniform_sampling, - ), self.__getstate__()) - - cdef int init( - self, - object X, - const float64_t[:, ::1] y, - const float64_t[:] sample_weight, - const unsigned char[::1] missing_values_in_feature_mask, - ) except -1: - Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) - - self.X = X - - # create a helper array for allowing efficient Fisher-Yates - self.multi_indices_to_sample = vector[vector[intp_t]](self.n_feature_sets) - - cdef intp_t i_feature = 0 - cdef intp_t feature_set_begin = 0 - cdef intp_t size_of_feature_set - cdef intp_t ifeat = 0 - cdef 
intp_t iproj = 0 - while iproj < self.max_features: - for i_feature in range(self.n_feature_sets): - size_of_feature_set = self.feature_set_ends[i_feature] - feature_set_begin - - for ifeat in range(size_of_feature_set): - self.multi_indices_to_sample[i_feature].push_back(ifeat + feature_set_begin + (iproj * self.n_features)) - iproj += 1 - if iproj >= self.max_features: - break - if iproj >= self.max_features: - break - - feature_set_begin = self.feature_set_ends[i_feature] - return 0 - - cdef void sample_proj_mat( - self, - vector[vector[float32_t]]& proj_mat_weights, - vector[vector[intp_t]]& proj_mat_indices - ) noexcept nogil: - """Sample projection matrix accounting for multi-views. - - This proceeds as a normal sampling projection matrix, - but now also uniformly samples features from each feature set. - """ - cdef intp_t n_features = self.n_features - cdef intp_t n_non_zeros = self.n_non_zeros - cdef UINT32_t* random_state = &self.rand_r_state - - cdef intp_t i, j, feat_i, proj_i, rand_vec_index - cdef float32_t weight - - # construct an array to sample from mTry x n_features set of indices - cdef vector[intp_t] indices_to_sample - cdef intp_t grid_size - - # compute the number of features in each feature set - cdef intp_t n_features_in_set - - # keep track of the beginning and ending indices of each feature set - cdef intp_t feature_set_begin, feature_set_end, idx - feature_set_begin = 0 - - # keep track of number of features sampled relative to n_non_zeros - cdef intp_t ifeature = 0 - - if self.uniform_sampling: - # 01: This algorithm samples features from each feature set uniformly and combines them - # into one sparse projection vector. - while ifeature < n_non_zeros: - for idx in range(self.n_feature_sets): - feature_set_end = self.feature_set_ends[idx] - n_features_in_set = feature_set_end - feature_set_begin - indices_to_sample = self.multi_indices_to_sample[idx] - grid_size = indices_to_sample.size() - - # shuffle indices over the 2D grid for this feature set to sample using Fisher-Yates - for i in range(0, grid_size): - j = rand_int(0, grid_size, random_state) - indices_to_sample[j], indices_to_sample[i] = \ - indices_to_sample[i], indices_to_sample[j] - - # sample a n_non_zeros matrix for each feature set, which proceeds by: - # - sample 'n_non_zeros' in a mtry X n_features projection matrix - # - which consists of +/- 1's chosen at a 1/2s rate - # for i in range(0, n_non_zeros_per_set): - # get the next index from the shuffled index array - rand_vec_index = indices_to_sample[0] - - # get the projection index (i.e. row of the projection matrix) and - # feature index (i.e. column of the projection matrix) - proj_i = rand_vec_index // n_features - feat_i = rand_vec_index % n_features - - # sample a random weight - weight = 1 if (rand_int(0, 2, random_state) == 1) else -1 - - proj_mat_indices[proj_i].push_back(feat_i) # Store index of nonzero - proj_mat_weights[proj_i].push_back(weight) # Store weight of nonzero - - # the new beginning is the previous end - feature_set_begin = feature_set_end - - ifeature += 1 - else: - # 02: Algorithm samples feature combinations from each feature set uniformly and evaluates - # them independently. 
- feature_set_begin = 0 - - # sample from a feature set - for idx in range(self.n_feature_sets): - feature_set_end = self.feature_set_ends[idx] - n_features_in_set = feature_set_end - feature_set_begin - - # indices to sample is a 1D-index array of size (max_features * n_features_in_set) - # which is Fisher-Yates shuffled to sample random features in each feature set - indices_to_sample = self.multi_indices_to_sample[idx] - grid_size = indices_to_sample.size() - - # shuffle indices over the 2D grid for this feature set to sample using Fisher-Yates - for i in range(0, grid_size): - j = rand_int(0, grid_size, random_state) - indices_to_sample[j], indices_to_sample[i] = \ - indices_to_sample[i], indices_to_sample[j] - - for i in range(0, n_non_zeros): - # get the next index from the shuffled index array - rand_vec_index = indices_to_sample[i] - - # get the projection index (i.e. row of the projection matrix) and - # feature index (i.e. column of the projection matrix) - proj_i = rand_vec_index // n_features - feat_i = rand_vec_index % n_features - - # sample a random weight - weight = 1 if (rand_int(0, 2, random_state) == 1) else -1 - - proj_mat_indices[proj_i].push_back(feat_i) # Store index of nonzero - proj_mat_weights[proj_i].push_back(weight) # Store weight of nonzero - - # the new beginning is the previous end - feature_set_begin = feature_set_end - cdef class MultiViewSplitterTester(MultiViewSplitter): """A class to expose a Python interface for testing.""" diff --git a/sktree/tree/_oblique_tree.pxd b/sktree/tree/_oblique_tree.pxd index 50c7949bf..d4597dbdb 100644 --- a/sktree/tree/_oblique_tree.pxd +++ b/sktree/tree/_oblique_tree.pxd @@ -21,7 +21,7 @@ from ._oblique_splitter cimport ObliqueSplitRecord cdef class ObliqueTree(Tree): cdef vector[vector[float32_t]] proj_vec_weights # (capacity, n_features) array of projection vectors - cdef vector[vector[intp_t]] proj_vec_indices # (capacity, n_features) array of projection vectors + cdef vector[vector[intp_t]] proj_vec_indices # (capacity, n_features) array of projection vectors # overridden methods cdef int _resize_c( diff --git a/sktree/tree/_oblique_tree.pyx b/sktree/tree/_oblique_tree.pyx index 2b0d16d94..3e43b14cc 100644 --- a/sktree/tree/_oblique_tree.pyx +++ b/sktree/tree/_oblique_tree.pyx @@ -138,6 +138,7 @@ cdef class ObliqueTree(Tree): proj_vecs = self.get_projection_matrix() d["proj_vecs"] = proj_vecs + return d def __setstate__(self, d): @@ -243,14 +244,13 @@ cdef class ObliqueTree(Tree): node.feature = deref(oblique_split_node).feature node.threshold = deref(oblique_split_node).threshold - # oblique trees store the projection indices and weights - # inside the tree itself - self.proj_vec_weights[node_id] = deref( - deref(oblique_split_node).proj_vec_weights - ) - self.proj_vec_indices[node_id] = deref( - deref(oblique_split_node).proj_vec_indices - ) + # TODO: this is not efficient. There are currently two copies being done of the optimal projection vector. + # One here and one within the splitter. We should try to remove one of these copies in a new design. 
+ # oblique trees store the projection indices and weights inside the tree itself + # Note: this makes a copy of the projection indices and weights by + # dereferencing the pointer to the split record to get the actual values + self.proj_vec_weights[node_id] = deref(oblique_split_node).proj_vec_weights + self.proj_vec_indices[node_id] = deref(oblique_split_node).proj_vec_indices return 1 cdef float32_t _compute_feature( diff --git a/sktree/tree/_projection.pxd b/sktree/tree/_projection.pxd new file mode 100644 index 000000000..de171cb20 --- /dev/null +++ b/sktree/tree/_projection.pxd @@ -0,0 +1,8 @@ +from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int32_t, intp_t, uint32_t + + +cpdef sample_projection_matrix(object X, intp_t max_features): + # Extract input (X is expected to be a scipy.sparse matrix, since its data, + # indices, and indptr arrays are unpacked below) + cdef float32_t[:] X_data = X.data + cdef int32_t[:] X_indices = X.indices + cdef int32_t[:] X_indptr = X.indptr diff --git a/sktree/tree/manifold/_morf_splitter.pyx b/sktree/tree/manifold/_morf_splitter.pyx index 2081ab852..7d471bfe4 100644 --- a/sktree/tree/manifold/_morf_splitter.pyx +++ b/sktree/tree/manifold/_morf_splitter.pyx @@ -11,7 +11,6 @@ cimport numpy as cnp cnp.import_array() -from cython.operator cimport dereference as deref from libcpp.vector cimport vector from ..._lib.sklearn.tree._criterion cimport Criterion @@ -411,8 +410,8 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): intp_t end, const intp_t[:] samples, float32_t[:] feature_values, - vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) - vector[intp_t]* proj_vec_indices # indices of the features (max_features,) + vector[float32_t]& proj_vec_weights, # weights of the vector (max_features,) + vector[intp_t]& proj_vec_indices # indices of the features (max_features,) ) noexcept nogil: """Compute the feature values for the samples[start:end] range.
@@ -433,13 +432,13 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): feature_values[idx] = 0 for jdx in range(0, proj_vec_indices.size()): feature_values[idx] += self.X[ - samples[idx], deref(proj_vec_indices)[jdx] - ] * deref(proj_vec_weights)[jdx] + samples[idx], proj_vec_indices[jdx] + ] * proj_vec_weights[jdx] if self.feature_weight is not None: # gets the feature weight for this specific column from X # the default of feature_weights[i] is (1/n_features) for all i - patch_weight += self.feature_weight[samples[idx], deref(proj_vec_indices)[jdx]] + patch_weight += self.feature_weight[samples[idx], proj_vec_indices[jdx]] if self.feature_weight is not None: feature_values[idx] /= patch_weight diff --git a/sktree/tree/tests/meson.build b/sktree/tree/tests/meson.build index c88eeda7c..a35b9fd84 100644 --- a/sktree/tree/tests/meson.build +++ b/sktree/tree/tests/meson.build @@ -7,6 +7,7 @@ python_sources = [ 'test_all_trees.py', 'test_unsupervised_tree.py', 'test_multiview.py', + 'test_oblique_trees.py', ] py3.install_sources( diff --git a/sktree/tree/tests/test_oblique_trees.py b/sktree/tree/tests/test_oblique_trees.py new file mode 100644 index 000000000..45f87eaa8 --- /dev/null +++ b/sktree/tree/tests/test_oblique_trees.py @@ -0,0 +1,34 @@ +import numpy as np + +from sktree import HonestForestClassifier +from sktree.tree import MultiViewDecisionTreeClassifier + + +def test_oblique_tree(): + """Test regression reported in https://github.com/neurodata/scikit-tree/issues/215.""" + n, a = ( + 10, + 20, + ) + x = np.random.normal(size=(n, a)) + y = np.random.binomial(1, 0.5, size=(n)) + + for seed in range(100): + # est = MultiViewDecisionTreeClassifier( + # max_features=0.3, + # feature_set_ends=[15, 20], + # random_state=seed, + # ) + + est = HonestForestClassifier( + n_estimators=10, + max_features=0.3, + feature_set_ends=[15, 20], + # bootstrap=True, + # max_samples=1.6, + tree_estimator=MultiViewDecisionTreeClassifier(), + random_state=seed, + n_jobs=-1, + ) + + est.fit(x, y) diff --git a/sktree/tree/tests/test_tree.py b/sktree/tree/tests/test_tree.py index 94cdfef62..1eb223c40 100644 --- a/sktree/tree/tests/test_tree.py +++ b/sktree/tree/tests/test_tree.py @@ -259,15 +259,18 @@ def test_oblique_tree_sampling(Tree, random_state=0): assert rc_cv_scores.mean() > 0.91 +@pytest.mark.parametrize("splitter", ["best", "random"]) @pytest.mark.parametrize("Tree", OBLIQUE_TREES.values()) -def test_oblique_trees_feature_combinations_less_than_n_features(Tree): +def test_oblique_trees_feature_combinations_less_than_n_features(Tree, splitter): """Test the hyperparameter ``feature_combinations`` behaves properly.""" X, y = iris.data[:5, :], iris.target[:5, ...] 
_, n_features = X.shape # assert that the feature combinations is less than the number of features - estimator = ObliqueDecisionTreeClassifier(random_state=0, feature_combinations=3) + estimator = ObliqueDecisionTreeClassifier( + splitter=splitter, random_state=0, feature_combinations=3 + ) estimator.fit(X, y) assert estimator.feature_combinations_ < n_features @@ -280,8 +283,9 @@ def test_oblique_trees_feature_combinations_less_than_n_features(Tree): assert estimator.feature_combinations_ < n_features +@pytest.mark.parametrize("splitter", ["best", "random"]) @pytest.mark.parametrize("Tree", OBLIQUE_TREES.values()) -def test_oblique_trees_feature_combinations(Tree): +def test_oblique_trees_feature_combinations(Tree, splitter): """Test the hyperparameter ``feature_combinations`` behaves properly.""" if is_classifier(Tree): @@ -296,27 +300,27 @@ with pytest.raises( RuntimeError, match=f"Feature combinations {n_features + 1} should not be greater" ): - estimator = Tree(random_state=0, feature_combinations=n_features + 1) + estimator = Tree(splitter=splitter, random_state=0, feature_combinations=n_features + 1) estimator.fit(X, y) # assert that the feature combinations is less than the number of features - estimator = Tree(random_state=0, feature_combinations=3) + estimator = Tree(splitter=splitter, random_state=0, feature_combinations=3) estimator.fit(X, y) assert estimator.feature_combinations_ < n_features # default option should make it 1.5 if n_features > 1.5 - estimator = Tree(random_state=0) + estimator = Tree(splitter=splitter, random_state=0) estimator.fit(X, y) assert estimator.feature_combinations_ == 1.5 # setting the feature combinations explicitly is fine as long as it is < n_features - estimator = Tree(random_state=0, feature_combinations=3) + estimator = Tree(splitter=splitter, random_state=0, feature_combinations=3) estimator.fit(X, y) assert estimator.feature_combinations_ == 3 # edge-case of only a single feature should set feature_combinations properly X = X[:, 0:1] - estimator = Tree(random_state=0) + estimator = Tree(splitter=splitter, random_state=0) estimator.fit(X, y) assert estimator.feature_combinations_ == 1 @@ -524,7 +528,9 @@ def test_balance_property(criterion, Tree): X, y = diabetes.data, diabetes.target reg = Tree(criterion=criterion) reg.fit(X, y) - assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y)) + assert np.sum(reg.predict(X)) == pytest.approx( + np.sum(y) + ), f"Failed with {Tree} and {criterion}: {np.sum(reg.predict(X))} != {np.sum(y)}" @pytest.mark.parametrize("Tree", ALL_TREES.values()) diff --git a/sktree/tree/unsupervised/_unsup_oblique_splitter.pxd b/sktree/tree/unsupervised/_unsup_oblique_splitter.pxd index 1e65a49b1..e8dce72ba 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_splitter.pxd +++ b/sktree/tree/unsupervised/_unsup_oblique_splitter.pxd @@ -20,8 +20,8 @@ cdef struct ObliqueSplitRecord: float64_t impurity_left # Impurity of the left split. float64_t impurity_right # Impurity of the right split.
- vector[float32_t]* proj_vec_weights # weights of the vector (max_features,) - vector[intp_t]* proj_vec_indices # indices of the features (max_features,) + vector[float32_t] proj_vec_weights # weights of the vector (max_features,) + vector[intp_t] proj_vec_indices # indices of the features (max_features,) cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): @@ -73,6 +73,6 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): intp_t end, const intp_t[:] samples, float32_t[:] feature_values, - vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) - vector[intp_t]* proj_vec_indices # indices of the features (max_features,) + vector[float32_t]& proj_vec_weights, # weights of the vector (max_features,) + vector[intp_t]& proj_vec_indices # indices of the features (max_features,) ) noexcept nogil diff --git a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx index 53b2bbd43..ce0417849 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx +++ b/sktree/tree/unsupervised/_unsup_oblique_splitter.pyx @@ -146,6 +146,7 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): """ # call parent reset UnsupervisedSplitter.node_reset(self, start, end, weighted_n_node_samples) + cdef intp_t i # Clear all projection vectors for i in range(self.max_features): @@ -172,8 +173,8 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): intp_t end, const intp_t[:] samples, float32_t[:] feature_values, - vector[float32_t]* proj_vec_weights, # weights of the vector (max_features,) - vector[intp_t]* proj_vec_indices # indices of the features (max_features,) + vector[float32_t]& proj_vec_weights, # weights of the vector (max_features,) + vector[intp_t]& proj_vec_indices # indices of the features (max_features,) ) noexcept nogil: """Compute the feature values for the samples[start:end] range. @@ -187,8 +188,8 @@ cdef class UnsupervisedObliqueSplitter(UnsupervisedSplitter): # Compute linear combination of features and then # sort samples according to the feature values. for jdx in range(0, proj_vec_indices.size()): - col_idx = deref(proj_vec_indices)[jdx] - col_weight = deref(proj_vec_weights)[jdx] + col_idx = proj_vec_indices[jdx] + col_weight = proj_vec_weights[jdx] for idx in range(start, end): # initialize the feature value to 0 @@ -282,6 +283,9 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): # instantiate the split records _init_split(&best_split, end) + with gil: + print("Splitting...") + # Sample the projection matrix self.sample_proj_mat(self.proj_mat_weights, self.proj_mat_indices) @@ -294,8 +298,8 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): # XXX: 'feature' is not actually used in oblique split records because it normally indicates the column # Just indicates which split was sampled current_split.feature = feat_i - current_split.proj_vec_weights = &self.proj_mat_weights[feat_i] - current_split.proj_vec_indices = &self.proj_mat_indices[feat_i] + # current_split.proj_vec_weights = &self.proj_mat_weights[feat_i] + # current_split.proj_vec_indices = &self.proj_mat_indices[feat_i] # Compute linear combination of features and then # sort samples according to the feature values. 
@@ -304,8 +308,8 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): end, samples, feature_values, - &self.proj_mat_weights[feat_i], - &self.proj_mat_indices[feat_i] + self.proj_mat_weights[feat_i], + self.proj_mat_indices[feat_i] ) # Sort the samples @@ -354,6 +358,8 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): best_split = current_split # copy + with gil: + print("Trying to reorg...") # Reorganize into samples[start:best_split.pos] + samples[best_split.pos:end] if best_split.pos < end: partition_end = end @@ -362,9 +368,9 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): while p < partition_end: # Account for projection vector temp_d = 0.0 - for j in range(best_split.proj_vec_indices.size()): - temp_d += self.X[samples[p], deref(best_split.proj_vec_indices)[j]] *\ - deref(best_split.proj_vec_weights)[j] + for j in range(self.proj_mat_weights[best_split.feature].size()): + temp_d += self.X[samples[p], self.proj_mat_indices[best_split.feature][j]] *\ + self.proj_mat_weights[best_split.feature][j] if temp_d <= best_split.threshold: p += 1 @@ -381,9 +387,21 @@ cdef class BestObliqueUnsupervisedSplitter(UnsupervisedObliqueSplitter): best_split.improvement = self.criterion.impurity_improvement( impurity, best_split.impurity_left, best_split.impurity_right) + # deref(oblique_split).proj_vec_weights = best_split.proj_vec_weights + # deref(oblique_split).proj_vec_indices = best_split.proj_vec_indices + with gil: + print("About to set weights") + print(best_split.feature) + print(self.proj_mat_weights.size(), self.proj_mat_indices.size()) + print(self.proj_mat_indices[best_split.feature].size(), self.proj_mat_weights[best_split.feature].size()) + with gil: + print(deref(oblique_split).proj_vec_indices.size(), deref(oblique_split).proj_vec_weights.size()) + + deref(oblique_split).proj_vec_indices = self.proj_mat_indices[best_split.feature] + deref(oblique_split).proj_vec_weights = self.proj_mat_weights[best_split.feature] + with gil: + print("Finished setting everything") # Return values - deref(oblique_split).proj_vec_indices = best_split.proj_vec_indices - deref(oblique_split).proj_vec_weights = best_split.proj_vec_weights deref(oblique_split).feature = best_split.feature deref(oblique_split).pos = best_split.pos deref(oblique_split).threshold = best_split.threshold diff --git a/sktree/tree/unsupervised/_unsup_oblique_tree.pxd b/sktree/tree/unsupervised/_unsup_oblique_tree.pxd index 5292551b9..4af00f445 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_tree.pxd +++ b/sktree/tree/unsupervised/_unsup_oblique_tree.pxd @@ -32,7 +32,7 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): SplitRecord* split_node, Node *node, intp_t node_id, - ) nogil except -1 + ) except -1 nogil cdef float32_t _compute_feature( self, const float32_t[:, :] X_ndarray, diff --git a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx index 5b77b9b04..2fd5ad0a7 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx +++ b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx @@ -110,6 +110,7 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): def __getstate__(self): """Getstate re-implementation, for pickling.""" d = {} + print("Setting state...") # capacity is inferred during the __setstate__ using nodes d["max_depth"] = self.max_depth d["node_count"] = self.node_count @@ -118,6 +119,7 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): proj_vecs = self.get_projection_matrix() 
d["proj_vecs"] = proj_vecs + print("Finished Setting state...") return d def __setstate__(self, d): @@ -128,7 +130,7 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): if "nodes" not in d: raise ValueError("You have loaded ObliqueTree version which " "cannot be imported") - + print("Getting state...") node_ndarray = d["nodes"] value_ndarray = d["values"] @@ -156,6 +158,7 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): self.proj_vec_weights[i].push_back(weight) self.proj_vec_indices[i].push_back(j) + print("Finsihed getting state...") memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), self.capacity * sizeof(Node)) memcpy(self.value, cnp.PyArray_DATA(value_ndarray), @@ -163,12 +166,14 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): cpdef cnp.ndarray get_projection_matrix(self): """Get the projection matrix of shape (node_count, n_features).""" + print("getting proj.") proj_vecs = np.zeros((self.node_count, self.n_features), dtype=np.float64) for i in range(0, self.node_count): for j in range(0, self.proj_vec_weights[i].size()): weight = self.proj_vec_weights[i][j] feat = self.proj_vec_indices[i][j] proj_vecs[i, feat] = weight + print("finished getting proj.") return proj_vecs cdef int _resize_c(self, intp_t capacity=SIZE_MAX) except -1 nogil: @@ -220,17 +225,19 @@ cdef class UnsupervisedObliqueTree(UnsupervisedTree): # https://www.codementor.io/@arpitbhayani/powering-inheritance-in-c-using-structure-composition-176sygr724 cdef ObliqueSplitRecord* oblique_split_node = (split_node) node_id = self.node_count + with gil: + print("Trying to set split nodes...") + node.feature = deref(oblique_split_node).feature node.threshold = deref(oblique_split_node).threshold # oblique trees store the projection indices and weights # inside the tree itself - self.proj_vec_weights[node_id] = deref( - deref(oblique_split_node).proj_vec_weights - ) - self.proj_vec_indices[node_id] = deref( - deref(oblique_split_node).proj_vec_indices - ) + self.proj_vec_weights[node_id] = deref(oblique_split_node).proj_vec_weights + self.proj_vec_indices[node_id] = deref(oblique_split_node).proj_vec_indices + + with gil: + print("Finished setting for ", node_id) return 1 cdef float32_t _compute_feature( diff --git a/spin b/spin index 7e69cd06b..f23a70790 100755 --- a/spin +++ b/spin @@ -1,6 +1,6 @@ #!/usr/bin/env python # -# Example stub for running `python -m spin` +# Example stub for running `spin` # # Copy this into your project root. 
diff --git a/test_tree.py b/test_tree.py new file mode 100644 index 000000000..09160ab76 --- /dev/null +++ b/test_tree.py @@ -0,0 +1,45 @@ +import joblib +import numpy as np + +from sktree.tree import ObliqueDecisionTreeClassifier, UnsupervisedObliqueDecisionTree + +X_small = np.array( + [ + [0, 0, 4, 0, 0, 0, 1, -14, 0, -4, 0, 0, 0, 0], + [0, 0, 5, 3, 0, -4, 0, 0, 1, -5, 0.2, 0, 4, 1], + [-1, -1, 0, 0, -4.5, 0, 0, 2.1, 1, 0, 0, -4.5, 0, 1], + [-1, -1, 0, -1.2, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 1], + [-1, -1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1], + [-1, -2, 0, 4, -3, 10, 4, 0, -3.2, 0, 4, 3, -4, 1], + [2.11, 0, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0.5, 0, -3, 1], + [2.11, 0, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0, 0, -2, 1], + [2.11, 8, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0, 0, -2, 1], + [2.11, 8, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0.5, 0, -1, 0], + [2, 8, 5, 1, 0.5, -4, 10, 0, 1, -5, 3, 0, 2, 0], + [2, 0, 1, 1, 1, -1, 1, 0, 0, -2, 3, 0, 1, 0], + [2, 0, 1, 2, 3, -1, 10, 2, 0, -1, 1, 2, 2, 0], + [1, 1, 0, 2, 2, -1, 1, 2, 0, -5, 1, 2, 3, 0], + [3, 1, 0, 3, 0, -4, 10, 0, 1, -5, 3, 0, 3, 1], + [2.11, 8, -6, -0.5, 0, 1, 0, 0, -3.2, 6, 0.5, 0, -3, 1], + [2.11, 8, -6, -0.5, 0, 1, 0, 0, -3.2, 6, 1.5, 1, -1, -1], + [2.11, 8, -6, -0.5, 0, 10, 0, 0, -3.2, 6, 0.5, 0, -1, -1], + [2, 0, 5, 1, 0.5, -2, 10, 0, 1, -5, 3, 1, 0, -1], + [2, 0, 1, 1, 1, -2, 1, 0, 0, -2, 0, 0, 0, 1], + [2, 1, 1, 1, 2, -1, 10, 2, 0, -1, 0, 2, 1, 1], + [1, 1, 0, 0, 1, -3, 1, 2, 0, -5, 1, 2, 1, 1], + [3, 1, 0, 1, 0, -4, 1, 0, 1, -2, 0, 0, 1, 0], + ] +) + +y_small = [1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0] + +for i in range(100): + pickle_path = "./clf.joblib" + # clf = UnsupervisedObliqueDecisionTree(random_state=i) + # clf.fit(X_small) + + clf = ObliqueDecisionTreeClassifier(random_state=i) + clf.fit(X_small, y_small) + + joblib.dump(clf, pickle_path) + loaded_clf = joblib.load(pickle_path, mmap_mode="r")
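+ + # Illustrative sanity check (a minimal sketch, assuming the usual scikit-learn + # predict API on the loaded, memory-mapped estimator): the pickle round-trip + # should preserve predictions on the training data. + assert np.array_equal(clf.predict(X_small), loaded_clf.predict(X_small))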