-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathkNN.py
91 lines (67 loc) · 3.56 KB
/
kNN.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from DataGenerator import generateData
from Preprocessing import transfromFeaturesToNoiseRandomly
from Filtering import fisherScoreFilter
from settings import (NUMBER_OF_CLASSES, NUMBER_OF_FEATURES, NUMBER_OF_RECORDS_PER_CLASS,
FEATURE_MEAN_RANGE, RANDOM_NUMBER_SEED, NUMBER_OF_FEATURES_TO_PRUNE,
TEST_SIZE_PERCENTAGE, NOISE_MEAN, NOISE_STD, constantFilterThreshold,
correlationFilterThreshold, maxNumberOfFeaturesToRemove)
# Experiment 1: k-NN accuracy as the i lowest-Fisher-score features are removed.
nRuns = 20
randomSeeds = range(0, nRuns)
# accuracy[i, r] = test accuracy with i features removed, on run/seed r.
accuracy = np.zeros((maxNumberOfFeaturesToRemove, nRuns))

# Read k (number of neighbours) from the command line, failing with a clear
# message instead of an opaque IndexError/ValueError.
if len(sys.argv) < 2:
    sys.exit("Usage: kNN.py <k>")
try:
    k = int(sys.argv[1])
except ValueError:
    sys.exit("k must be an integer, got: %r" % sys.argv[1])

for r in randomSeeds:
    # Fresh synthetic dataset per run, seeded with r for reproducibility.
    data, labels = generateData(NUMBER_OF_CLASSES, NUMBER_OF_FEATURES,
                                NUMBER_OF_RECORDS_PER_CLASS, FEATURE_MEAN_RANGE, r)
    # Replace a random subset of features with Gaussian noise (seeded with r).
    prunedTrainData = transfromFeaturesToNoiseRandomly(data, labels,
                                                       NUMBER_OF_FEATURES_TO_PRUNE,
                                                       NOISE_MEAN, NOISE_STD, r)
    # Seed the split too — the original left train_test_split unseeded, so
    # results varied between invocations despite the per-run seed r.
    X_train, X_test, y_train, y_test = train_test_split(prunedTrainData, labels,
                                                        test_size=TEST_SIZE_PERCENTAGE,
                                                        random_state=r)

    # Baseline: all features kept (i == 0).
    classifier = KNeighborsClassifier(n_neighbors=k)
    classifier.fit(X_train, y_train)
    accuracy[0, r] = accuracy_score(y_test, classifier.predict(X_test))

    # Remove the i lowest-scoring features (per Fisher score on the training
    # split) and re-evaluate; the same columns are dropped from the test split.
    for i in range(1, maxNumberOfFeaturesToRemove):
        X_train_i, fisherScores, removedFeatures = fisherScoreFilter(
            i, X_train, y_train, NUMBER_OF_CLASSES)
        classifier = KNeighborsClassifier(n_neighbors=k)
        classifier.fit(X_train_i, y_train)
        X_test_i = np.delete(X_test, removedFeatures, axis=1)
        accuracy[i, r] = accuracy_score(y_test, classifier.predict(X_test_i))

averageAccuracy = np.mean(accuracy, axis=1)
# The original printed accuracy[8, :] — a hard-coded debug index that raises
# IndexError whenever maxNumberOfFeaturesToRemove <= 8. Report the per-removal
# average across runs instead.
print(averageAccuracy)
# Experiment 2: identical protocol, but features are removed uniformly at
# random instead of by Fisher score (control arm for the comparison plot).
accuracy_rand = np.zeros((maxNumberOfFeaturesToRemove, nRuns))
for r in randomSeeds:
    data, labels = generateData(NUMBER_OF_CLASSES, NUMBER_OF_FEATURES,
                                NUMBER_OF_RECORDS_PER_CLASS, FEATURE_MEAN_RANGE, r)
    prunedTrainData = transfromFeaturesToNoiseRandomly(data, labels,
                                                       NUMBER_OF_FEATURES_TO_PRUNE,
                                                       NOISE_MEAN, NOISE_STD, r)
    # Seed the split so this arm is reproducible, matching the Fisher arm.
    X_train, X_test, y_train, y_test = train_test_split(prunedTrainData, labels,
                                                        test_size=TEST_SIZE_PERCENTAGE,
                                                        random_state=r)

    # Baseline: all features kept (i == 0).
    classifier = KNeighborsClassifier(n_neighbors=k)
    classifier.fit(X_train, y_train)
    accuracy_rand[0, r] = accuracy_score(y_test, classifier.predict(X_test))

    # Per-run seeded generator: the original drew from the unseeded global
    # np.random state, making "random filtering" irreproducible even though
    # every other source of randomness was seeded with r.
    rng = np.random.default_rng(r)
    for i in range(1, maxNumberOfFeaturesToRemove):
        # Pick i distinct columns from however many features the data actually
        # has; the original hard-coded 12, which breaks whenever
        # NUMBER_OF_FEATURES != 12.
        removedFeatures = rng.choice(X_train.shape[1], size=i, replace=False)
        X_train_i = np.delete(X_train, removedFeatures, axis=1)
        classifier = KNeighborsClassifier(n_neighbors=k)
        classifier.fit(X_train_i, y_train)
        X_test_i = np.delete(X_test, removedFeatures, axis=1)
        accuracy_rand[i, r] = accuracy_score(y_test, classifier.predict(X_test_i))

averageAccuracy_rand = np.mean(accuracy_rand, axis=1)
# Plot mean accuracy (over runs) against the number of removed features,
# comparing Fisher-score filtering with the random-removal control.
fig, ax = plt.subplots()
ax.plot(averageAccuracy, label='Fisher score')
ax.plot(averageAccuracy_rand, label='Random filtering')
ax.set_xlabel('Number of features removed')
ax.set_ylabel('Accuracy')
ax.set_title('Fisher score VS random filtering')
ax.legend()
ax.grid()
plt.show()