-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathleave_one_out.py
162 lines (148 loc) · 5.74 KB
/
leave_one_out.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/python
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import elm
import scipy as sk
import numpy as np
from sklearn.utils.testing import assert_greater, assert_raise_message,assert_allclose
import matplotlib.patches as mpatches
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from pandas.tools.plotting import scatter_matrix
from sklearn import model_selection
from sklearn.utils.estimator_checks import check_estimator
from sklearn.preprocessing import label_binarize
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.preprocessing import normalize
from sklearn.metrics import roc_curve, auc
from sklearn import svm, datasets
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
#GMLVQ , www.cs.rug.nl/~biehl
#forest, DTC
def readFeaturesFile(path="mfcc_featuresLR.txt"):
    """Load the MFCC feature table and run the classifier comparison on it.

    The file is parsed with ``pandas.read_csv`` using explicit column names:
    thirteen feature columns ``Feature1`` .. ``Feature13`` followed by a
    final ``Gender`` column.  According to the original author's notes the
    last column is the classification target and may encode labels other
    than gender (e.g. normal-female / edema-female).

    Args:
        path: location of the comma-separated feature file.  Defaults to the
            file name the script has always used, so existing callers that
            pass no argument behave exactly as before.

    Returns:
        None.  The loaded table is handed straight to ``training``.
    """
    # Feature1 .. Feature13, then the target column.
    names = ['Feature%d' % index for index in range(1, 14)] + ['Gender']
    data = pd.read_csv(path, names=names)
    training(data)
    # visualizeData(data)  # enable to inspect the table before training
# Thin wrapper around scikit-learn's cross-validation scorer; callers time it
# from the outside.
def compare_Algorithms(model,X_train,Y_train,kfold,scoring):
    """Cross-validate ``model`` and return the score obtained on every split.

    Args:
        model: estimator to evaluate.
        X_train: training inputs.
        Y_train: training targets.
        kfold: cross-validation splitter, forwarded as ``cv``.
        scoring: name of the metric, forwarded as ``scoring``.

    Returns:
        Whatever ``model_selection.cross_val_score`` returns for these
        arguments (one score per split).
    """
    scores = model_selection.cross_val_score(
        estimator=model,
        X=X_train,
        y=Y_train,
        cv=kfold,
        scoring=scoring,
    )
    return scores
def preparingData(data):
    """Separate a feature table into model inputs and target values.

    Args:
        data: object exposing a two-dimensional ``.values`` array (the script
            passes a pandas DataFrame) with at least 14 columns.

    Returns:
        A tuple ``(X, Y)`` where ``X`` holds the first 13 columns of every
        row and ``Y`` holds the 14th column.
    """
    matrix = data.values
    # Columns 0..12 are the inputs, column 13 is the target.
    features, target = matrix[:, :13], matrix[:, 13]
    return features, target
def training(data):
    """Compare six classifiers with 10-fold and leave-one-out cross-validation.

    The user is asked on stdin how many times to repeat the procedure.  Every
    run draws a fresh random 80/20 train/validation split of the table and,
    for each model, cross-validates on the training part with a shuffled
    10-fold splitter and with leave-one-out.  For every model the run prints
    the mean k-fold accuracy, the mean leave-one-out accuracy and the
    wall-clock time both evaluations took together.  After the last run the
    averages of those three numbers over all runs are printed.

    Args:
        data: feature table; it is split into inputs and target by
            ``preparingData``.

    Returns:
        None.  All results are written to stdout.

    Raises:
        ValueError: if the answer to the prompt is not an integer or is
            smaller than 1 (averaging over zero runs is undefined).
    """
    models = [
        ('LR', LogisticRegression()),
        ('LDA', LinearDiscriminantAnalysis()),
        ('KNN', KNeighborsClassifier()),
        ('CART', DecisionTreeClassifier()),
        ('NB', GaussianNB()),
        ('SVM', SVC()),
    ]
    names = [name for name, _ in models]

    # Running sums per model; divided by the number of runs at the end.
    mean_kfold = np.zeros(len(models))
    mean_loo = np.zeros(len(models))
    mean_times = np.zeros(len(models))

    # The script was written for Python 2; fall back to input() on Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    n = int(read_line('How many times you want to run the procedure? '))
    if n < 1:
        raise ValueError('the number of runs must be at least 1, got %d' % n)

    # Everything below is identical for every run, so build it once.
    X, Y = preparingData(data)
    validation_size = 0.20  # 20% held out, 80% used for cross-validation
    scoring = 'accuracy'
    # No random_state is given, so the folds are reshuffled on every use.
    kfold = model_selection.KFold(n_splits=10, shuffle=True)
    leave_one_out = model_selection.LeaveOneOut()
    row_format = "%s: %f %f %f"

    for _ in range(n):
        # Only the training part is used here; the validation part is drawn
        # but not evaluated by this procedure.
        X_train, _X_validation, Y_train, _Y_validation = (
            model_selection.train_test_split(X, Y, test_size=validation_size))
        print('\n')
        print('Algorithm: accuracy(k-fold) accuracy(leave-one-out) time')
        for index, (name, model) in enumerate(models):
            # Time both cross-validation schemes together.
            start_time = time.time()
            cv_results = compare_Algorithms(
                model, X_train, Y_train, kfold, scoring)
            cv_results_loo = compare_Algorithms(
                model, X_train, Y_train, leave_one_out, scoring)
            elapsed = time.time() - start_time

            kfold_accuracy = cv_results.mean()
            loo_accuracy = cv_results_loo.mean()
            mean_kfold[index] += kfold_accuracy
            mean_loo[index] += loo_accuracy
            mean_times[index] += elapsed
            print(row_format % (name, kfold_accuracy, loo_accuracy, elapsed))

    # Turn the accumulated sums into per-model means over all runs.
    mean_kfold /= n
    mean_loo /= n
    mean_times /= n

    print('\n\n')
    print('Mean of every iteration:')
    print('Algorithm: accuracy(k=10-fold) accuracy(leave-one-out) time')
    for name, kfold_accuracy, loo_accuracy, elapsed in zip(
            names, mean_kfold, mean_loo, mean_times):
        print(row_format % (name, kfold_accuracy, loo_accuracy, elapsed))
def visualizeData(data):
    """Print summary statistics of the feature table and plot it.

    Printed to stdout, in order: the table's shape, its first 20 rows, the
    output of ``describe()`` and the number of rows per ``Gender`` value.
    Then a box-and-whisker plot per column and a scatter-plot matrix are
    drawn and shown with ``plt.show()``.

    Args:
        data: pandas DataFrame containing a ``Gender`` column.

    Returns:
        None.
    """
    # Textual overview.
    print(data.shape)
    print(data.head(20))
    print(data.describe())
    # Class distribution.
    print(data.groupby('Gender').size())

    # Box-and-whisker plots, one subplot per column.  The original issued
    # this identical call twice, which only produced a duplicate figure.
    data.plot(kind='box', subplots=True, sharex=False, sharey=False)
    # Scatter-plot matrix.
    scatter_matrix(data)
    plt.show()
def main():
    """Script entry point: load the feature file and run the comparison."""
    readFeaturesFile()


# Run only when executed as a script, so that importing this module does not
# start the interactive procedure.
if __name__ == "__main__":
    main()