-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathdecision_tree.py
92 lines (69 loc) · 2.81 KB
/
decision_tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import ctypes
import numpy as np
import pandas as pd
import ipdb
class DecisionTree:
    """Thin ctypes wrapper around a natively compiled decision tree.

    The native library handle is read from the module-level name ``mylib``
    each time ``fit`` or ``predict`` runs; this class does not load it.
    """

    # ctypes pointer types used to hand a 2-D array to the native code as
    # ``double**``. They are class attributes so they exist before ``fit``.
    # ref : https://stackoverflow.com/questions/58727931/how-to-pass-a-2d-array-from-python-to-c
    doublePtr = ctypes.POINTER(ctypes.c_double)
    doublePtrPtr = ctypes.POINTER(doublePtr)

    def __init__(self, max_depth: int = 4, min_size: int = 2):
        """
        Parameters
        ----------
        max_depth : maximum depth of the tree
        min_size : minimum size of the tree, forwarded unchanged to the
            native constructor (NOTE(review): presumably the minimum number
            of samples in a node -- confirm against the C++ implementation)
        """
        self.max_depth = max_depth
        self.min_size = min_size
        # Opaque handle of the native tree object; stays None until fit() runs.
        self.tree_obj = None

    @classmethod
    def _as_double_ptr_ptr(cls, array: np.ndarray):
        """Return a ``double**`` ctypes pointer addressing the rows of a 2-D C-contiguous float64 array."""
        ct_arr = np.ctypeslib.as_ctypes(array)
        # One ``double*`` per row, packed into a ctypes array of pointers.
        row_ptr_array_type = cls.doublePtr * len(ct_arr)
        row_ptrs = row_ptr_array_type(
            *(ctypes.cast(row, cls.doublePtr) for row in ct_arr)
        )
        return ctypes.cast(row_ptrs, cls.doublePtrPtr)

    def fit(self, X: pd.DataFrame, y: pd.Series, print_tree=False):
        """
        Create a native tree and fit it on the given data.

        Parameters
        ----------
        X : Dataframe of features
        y : Series of labels
        print_tree : flag forwarded to the native ``fit_tree`` call as a C bool
            (NOTE(review): presumably makes the native code print the fitted
            tree -- confirm against the C++ implementation)
        """
        # The native fit routine receives a single table: the feature columns
        # followed by the label column.
        data = pd.concat([X, y], axis=1)
        # we need to have a numpy contiguous array to pass into the C++ fit function
        array = np.ascontiguousarray(data.values, np.double)
        ct_ptr = self._as_double_ptr_ptr(array)

        # The constructor returns a pointer; without an explicit restype ctypes
        # would treat the result as a C int.
        mylib.new_tree.restype = ctypes.c_void_p
        # define input types
        mylib.new_tree.argtypes = [ctypes.c_int, ctypes.c_int]
        mylib.fit_tree.argtypes = [
            ctypes.c_void_p,
            self.doublePtrPtr,
            ctypes.c_int,
            ctypes.c_int,
            ctypes.c_bool,
        ]
        mylib.predict.argtypes = [
            ctypes.c_void_p,
            self.doublePtrPtr,
            ctypes.c_int,
            ctypes.c_int,
        ]

        n_rows, n_cols = array.shape
        self.tree_obj = mylib.new_tree(self.max_depth, self.min_size)
        mylib.fit_tree(self.tree_obj, ct_ptr, n_rows, n_cols, print_tree)

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """
        Predict with the previously fitted native tree.

        Parameters
        ----------
        X : Dataframe of features

        Returns
        -------
        predictions : ndarray of shape (n_samples,)

        Raises
        ------
        AttributeError
            If ``fit`` has not produced a native tree handle yet.
        """
        # AttributeError is what an unfitted instance raised before (missing
        # attributes), so callers catching it keep working -- now with a
        # message that says what to do.
        if self.tree_obj is None:
            raise AttributeError(
                "DecisionTree is not fitted yet; call fit() before predict()"
            )

        array = np.ascontiguousarray(X.values, np.double)
        n_samples, n_features = array.shape
        # define predict outputs types (we need to define it every time to output the
        # right number of elements we are predicting)
        mylib.predict.restype = ctypes.POINTER(ctypes.c_size_t * n_samples)
        ct_ptr = self._as_double_ptr_ptr(array)
        result = mylib.predict(self.tree_obj, ct_ptr, n_samples, n_features)
        # Copy the values out of the native buffer into a new numpy array.
        # NOTE(review): the buffer returned by the native predict call is never
        # released on the Python side -- confirm who owns it.
        return np.array(list(result.contents))
# Load the shared library that was compiled by running setup.py.
libfile = r"build\lib.win-amd64-3.7\decision_tree.pyd"
mylib = ctypes.CDLL(libfile)

# Read the tab-separated data set: every column except the last is used as a
# feature, the last column is used as the label.
data = pd.read_csv(r"data\setosa.txt", sep="\t")
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Build the tree wrapper, fit it with print_tree enabled, then predict on the
# same features that were used for fitting.
dt = DecisionTree(max_depth=4, min_size=2)
dt.fit(X, y, print_tree=True)
predictions = dt.predict(X)
print('Predictions: \n{}'.format(predictions))