# sefr_binary_visiualization.py
import numpy as np


class SEFR:
"""
This is the original binary classifier proposed in
'SEFR: A Fast Linear-Time Classifier for Ultra-Low Power Devices' (https://arxiv.org/abs/2006.04620)
with visiualization.
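
    Typical usage (see the demo at the end of this file):
        clf = SEFR()
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)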
"""

    def __init__(self):
        """
        Initialize the model with empty weights and zero bias.
        """
        self.weights = []
        self.bias = 0

    def fit(self, data_train, target_train):
        """
        Train the classifier on the given data.

        Parameters
        ----------
        data_train : list or numpy array of floats, two dimensional
            the training data, one record per row
        target_train : list or numpy array of integers
            the labels, which should consist of 0s and 1s
        """
        self.weights = []
        self.bias = 0
        data_train = np.array(data_train, dtype='float32')
        target_train = np.array(target_train, dtype='int32')

        # pos_labels / neg_labels are boolean masks marking the records
        # whose label is positive / negative
        pos_labels = (target_train > 0)
        neg_labels = np.invert(pos_labels)

        # pos_records / neg_records are the data rows whose labels are
        # positive / negative
        pos_records = data_train[pos_labels]
        neg_records = data_train[neg_labels]

        # avg_pos / avg_neg are the average value of each feature over the
        # positive / negative records
        avg_pos = np.mean(pos_records, axis=0)  # Eq. 3
        avg_neg = np.mean(neg_records, axis=0)  # Eq. 4

        # each weight contrasts the two class averages of its feature; the
        # denominator assumes avg_pos + avg_neg is nonzero, which holds for
        # the non-negative feature values SEFR is designed for
        self.weights = (avg_pos - avg_neg) / (avg_pos + avg_neg)  # Eq. 5

        # score every training record with the learned weights
        weighted_scores = np.dot(data_train, self.weights)  # Eq. 6

        # pos_score_avg / neg_score_avg are the average scores of the
        # positive / negative records
        pos_score_avg = np.mean(weighted_scores[pos_labels])  # Eq. 7
        neg_score_avg = np.mean(weighted_scores[neg_labels])  # Eq. 8

        # number of records in each class
        pos_label_count = pos_records.shape[0]
        neg_label_count = neg_records.shape[0]

        # the bias is a weighted average of the two class score averages,
        # each weighted by the size of the opposite class
        self.bias = -(neg_label_count * pos_score_avg +
                      pos_label_count * neg_score_avg) / (neg_label_count + pos_label_count)

    def predict(self, data_test):
        """
        Predict labels for test data once the model has been trained.

        Parameters
        ----------
        data_test : list or numpy array of floats, two dimensional
            the unlabeled test data, one record per row

        Returns
        ----------
        predictions as a numpy array of 0s and 1s
        """
        data_test = np.array(data_test, dtype='float32')
        weighted_score = np.dot(data_test, self.weights)
        preds = np.where(weighted_score + self.bias > 0, 1, 0)
        return preds
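
    # Not part of the original SEFR implementation: a small convenience
    # sketch that exposes the raw decision score w·x + b, useful e.g. for
    # plotting a smoother ROC curve than hard 0/1 predictions allow.
    def decision_function(self, data_test):
        """
        Return the raw decision scores (w·x + b) for the given data.
        """
        data_test = np.array(data_test, dtype='float32')
        return np.dot(data_test, self.weights) + self.bias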
# ============================================================
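# Demo: train SEFR on synthetic two-feature blobs, then visualize the test
# data, the predictions with the learned decision boundary, the ROC curve,
# and the confusion matrix in a 2x2 figure.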
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, auc, confusion_matrix, ConfusionMatrixDisplay, roc_curve, RocCurveDisplay, classification_report
import matplotlib.pyplot as plt
# generate random data with 2 features
X, y = make_blobs(n_samples=2500, n_features=2, centers=2, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# train the model and predict on the test split
clf = SEFR()
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
# calculate false/true positive rates and the area under the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, predicted)
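# note: roc_curve on hard 0/1 predictions yields only a coarse curve with a
# single operating point; passing continuous scores instead (e.g. from the
# decision_function sketch above) would trace a fuller curve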
auc_value = auc(fpr, tpr)
accuracy = accuracy_score(y_test, predicted)
print('Accuracy:', accuracy)
print(classification_report(y_test, predicted))
plt.rcParams['font.size'] = 10
fig = plt.figure(figsize=(8, 8))
# draw test data
ax = fig.add_subplot(221)
ax.set_title('Test Data')
plt.scatter(*X_test.T, c=y_test, cmap='Paired', alpha=0.8)
ax.grid(True)
# draw prediction
ax = fig.add_subplot(222)
ax.set_title(f'Predictions (accuracy: {accuracy:.3f})')
plt.scatter(*X_test.T, c=predicted, cmap='Set2', alpha=0.8)
# draw hyperplane
x1 = np.linspace(X_test.T[0].min(), X_test.T[0].max(), 2)
x2 = (-clf.bias - clf.weights[0] * x1) / clf.weights[1] # x2 = (-b - w1x1) / w2
plt.plot(x1, x2, color='orange')
ax.grid(True)
# draw ROC/AUC
ax = fig.add_subplot(223)
ax.set_title(f'ROC AUC = {auc_value:.3f}')
RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=ax)
plt.plot([0, 1], [0, 1], color='grey', linestyle='--')
ax.grid(True)
# draw confusion matrix
ax = fig.add_subplot(224)
ax.set_title('Confusion matrix')
cm = confusion_matrix(y_test, predicted)
ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax, colorbar=False, cmap='summer')
# visualization
plt.tight_layout()
plt.show()