# sefr_binary_visiualization.py
import numpy as np


class SEFR:
"""
This is the original binary classifier proposed in
'SEFR: A Fast Linear-Time Classifier for Ultra-Low Power Devices' (https://arxiv.org/abs/2006.04620)
with visiualization.
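
    Typical usage (see the demo at the end of this file):
        clf = SEFR()
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)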
"""

    def __init__(self):
        """
        Initialize the model with empty weights and zero bias.
        """
        self.weights = []
        self.bias = 0

    def fit(self, data_train, target_train):
        """
        Train the classifier on the given data.

        Parameters
        ----------
        data_train : list or numpy array of floats, two dimensional
            the training data, one record per row
        target_train : list or numpy array of integers
            the labels, which should consist of 0s and 1s
        """
        self.weights = []
        self.bias = 0
        data_train = np.array(data_train, dtype='float32')
        target_train = np.array(target_train, dtype='int32')

        # pos_labels / neg_labels are boolean masks marking the records
        # whose label is positive / negative
        pos_labels = (target_train > 0)
        neg_labels = np.invert(pos_labels)

        # pos_records / neg_records are the data rows whose labels are
        # positive / negative
        pos_records = data_train[pos_labels]
        neg_records = data_train[neg_labels]

        # avg_pos / avg_neg are the average value of each feature over the
        # positive / negative records
        avg_pos = np.mean(pos_records, axis=0)  # Eq. 3
        avg_neg = np.mean(neg_records, axis=0)  # Eq. 4

        # each weight contrasts the two class averages of its feature; the
        # denominator assumes avg_pos + avg_neg is nonzero, which holds for
        # the non-negative feature values SEFR is designed for
        self.weights = (avg_pos - avg_neg) / (avg_pos + avg_neg)  # Eq. 5

        # score every training record with the learned weights
        weighted_scores = np.dot(data_train, self.weights)  # Eq. 6

        # pos_score_avg / neg_score_avg are the average scores of the
        # positive / negative records
        pos_score_avg = np.mean(weighted_scores[pos_labels])  # Eq. 7
        neg_score_avg = np.mean(weighted_scores[neg_labels])  # Eq. 8

        # number of records in each class
        pos_label_count = pos_records.shape[0]
        neg_label_count = neg_records.shape[0]

        # the bias is a weighted average of the two class score averages,
        # each weighted by the size of the opposite class
        self.bias = -(neg_label_count * pos_score_avg +
                      pos_label_count * neg_score_avg) / (neg_label_count + pos_label_count)

    def predict(self, data_test):
        """
        Predict labels for test data once the model has been trained.

        Parameters
        ----------
        data_test : list or numpy array of floats, two dimensional
            the unlabeled test data, one record per row

        Returns
        ----------
        predictions as a numpy array of 0s and 1s
        """
        data_test = np.array(data_test, dtype='float32')
        weighted_score = np.dot(data_test, self.weights)
        preds = np.where(weighted_score + self.bias > 0, 1, 0)
        return preds
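
    # Not part of the original SEFR implementation: a small convenience
    # sketch that exposes the raw decision score w·x + b, useful e.g. for
    # plotting a smoother ROC curve than hard 0/1 predictions allow.
    def decision_function(self, data_test):
        """
        Return the raw decision scores (w·x + b) for the given data.
        """
        data_test = np.array(data_test, dtype='float32')
        return np.dot(data_test, self.weights) + self.bias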
# ============================================================
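# Demo: train SEFR on synthetic two-feature blobs, then visualize the test
# data, the predictions with the learned decision boundary, the ROC curve,
# and the confusion matrix in a 2x2 figure.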
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, auc, confusion_matrix, ConfusionMatrixDisplay, roc_curve, RocCurveDisplay, classification_report
import matplotlib.pyplot as plt
# generate random data with 2 features
X, y = make_blobs(n_samples=2500, n_features=2, centers=2, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# train the model and predict on the test split
clf = SEFR()
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
# calculate false/true positive rates and the area under the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, predicted)
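# note: roc_curve on hard 0/1 predictions yields only a coarse curve with a
# single operating point; passing continuous scores instead (e.g. from the
# decision_function sketch above) would trace a fuller curve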
auc_value = auc(fpr, tpr)
accuracy = accuracy_score(y_test, predicted)
print('Accuracy:', accuracy)
print(classification_report(y_test, predicted))
plt.rcParams['font.size'] = 10
fig = plt.figure(figsize=(8, 8))
# draw test data
ax = fig.add_subplot(221)
ax.set_title('Test Data')
plt.scatter(*X_test.T, c=y_test, cmap='Paired', alpha=0.8)
ax.grid(True)
# draw prediction
ax = fig.add_subplot(222)
ax.set_title(f'Predictions (accuracy: {accuracy:.3f})')
plt.scatter(*X_test.T, c=predicted, cmap='Set2', alpha=0.8)
# draw hyperplane
x1 = np.linspace(X_test.T[0].min(), X_test.T[0].max(), 2)
x2 = (-clf.bias - clf.weights[0] * x1) / clf.weights[1] # x2 = (-b - w1x1) / w2
plt.plot(x1, x2, color='orange')
ax.grid(True)
# draw ROC/AUC
ax = fig.add_subplot(223)
ax.set_title(f'ROC AUC = {auc_value:.3f}')
RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=ax)
plt.plot([0, 1], [0, 1], color='grey', linestyle='--')
ax.grid(True)
# draw confusion matrix
ax = fig.add_subplot(224)
ax.set_title('Confusion matrix')
cm = confusion_matrix(y_test, predicted)
ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax, colorbar=False, cmap='summer')
# visualization
plt.tight_layout()
plt.show()