-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata_preprocess.py
119 lines (110 loc) · 4.42 KB
/
data_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import csv
import random
import math
import numpy as np
# Show two decimal places whenever a NumPy array is printed.
np.set_printoptions(precision=2)

# Module-level 500 x 9 buffer: normalize() writes into it in place and
# PCA() reads it.
X = np.zeros(shape=(500, 9), dtype=float)
def classifier(threshold, fileName, normalizedFile):
    """Write the normalized data set and a binarized copy of it to CSV.

    The rows of ``fileName`` are passed through ``normalize``.  Each resulting
    row is written, rounded to two decimals, to ``normalizedFile``.  The value
    in column 8 is then replaced by 0 when it is below ``threshold`` and by 1
    otherwise, and the relabelled row is written (again rounded to two
    decimals) to ``Classified_Admission.csv``.

    Args:
        threshold: Cut-off compared against column 8 of every row.
        fileName: Path of the CSV file to read.
        normalizedFile: Path of the CSV file receiving the normalized rows.

    Returns:
        None.
    """
    # Read the input completely before any output file is created, and make
    # sure every handle is closed (and therefore flushed) deterministically.
    with open(fileName, "r", newline="") as source:
        old_data = list(csv.reader(source))

    data = normalize(old_data)

    with open(normalizedFile, "w", newline="") as reg_file, \
            open("Classified_Admission.csv", "w", newline="") as class_file:
        reg_new_data = csv.writer(reg_file)
        class_new_data = csv.writer(class_file)
        for row in data:
            reg_new_data.writerow(np.around(row, 2))
            # NOTE: the label is written into the row itself, so the array
            # returned by normalize() ends up holding 0/1 in column 8, exactly
            # as before this rewrite.
            row[8] = 0 if float(row[8]) < threshold else 1
            class_new_data.writerow(np.around(row, 2))
    return
def split_data(test_data_size, reg_train, class_train, reg_test, class_test):
    """Shuffle the regression and classification data and split each in two.

    Reads ``Regression_Admission.csv`` and ``Classified_Admission.csv``,
    shuffles each list of rows, and writes the last ``test_data_size`` rows of
    each shuffled list to the test file and all preceding rows to the train
    file.  Train and test rows never overlap.

    The two data sets are shuffled independently, so row ``i`` of the
    regression split does not correspond to row ``i`` of the classification
    split.

    Args:
        test_data_size: Number of rows placed in each test file.
        reg_train: Output path for the regression training rows.
        class_train: Output path for the classification training rows.
        reg_test: Output path for the regression test rows.
        class_test: Output path for the classification test rows.

    Returns:
        None.

    Raises:
        ValueError: If ``test_data_size`` is negative or larger than either
            data set.
    """
    with open("Regression_Admission.csv", "r", newline="") as handle:
        reg_data = list(csv.reader(handle))
    with open("Classified_Admission.csv", "r", newline="") as handle:
        class_data = list(csv.reader(handle))

    if not 0 <= test_data_size <= min(len(reg_data), len(class_data)):
        raise ValueError(
            "test_data_size must be between 0 and the number of rows "
            "available, got %r" % (test_data_size,)
        )

    # Same shuffle order as before (classification first) so that a seeded
    # random generator yields the same permutations.
    random.shuffle(class_data)
    random.shuffle(reg_data)

    # The split point is derived from the actual row count.  The previous
    # hard-coded range(501 - test_data_size) wrote one row too many to the
    # training files, leaking the first test row into the training set.
    reg_cut = len(reg_data) - test_data_size
    class_cut = len(class_data) - test_data_size

    outputs = (
        (reg_train, reg_data[:reg_cut]),
        (class_train, class_data[:class_cut]),
        (reg_test, reg_data[reg_cut:]),
        (class_test, class_data[class_cut:]),
    )
    for path, rows in outputs:
        with open(path, "w", newline="") as handle:
            csv.writer(handle).writerows(rows)
    return
# making mean of the dataset equal to zero
def normalize(old_data):
    """Copy ``old_data`` into the module-level ``X`` and standardize features.

    Every value is converted with ``float`` and stored in ``X``.  Columns 1
    through 7 are shifted to zero mean and divided by their sample standard
    deviation (``m - 1`` in the denominator).  Column 0 and column 8 are
    copied unchanged.  (Presumably column 0 is a row identifier and column 8
    the target value -- confirm against the data file.)

    Args:
        old_data: Non-empty sequence of equally long rows whose items are
            accepted by ``float`` (for example the rows of ``csv.reader``).

    Returns:
        The module-level array ``X``.  Only its first ``len(old_data)`` rows
        are overwritten; any further rows keep their previous content.

    Raises:
        ValueError: If ``old_data`` is empty, is not rectangular, contains a
            value that ``float`` rejects, or does not fit into ``X``.
    """
    # Plain float() replaces the np.float alias, which NumPy 1.24 removed.
    raw = np.array([[float(value) for value in row] for row in old_data],
                   dtype=float)
    if raw.ndim != 2:
        raise ValueError("old_data must be a non-empty table of equally long rows")
    m, n = raw.shape
    if m > X.shape[0] or n > X.shape[1]:
        raise ValueError(
            "old_data has shape %r but at most %r is supported"
            % ((m, n), X.shape)
        )

    X[:m, :n] = raw

    # Feature columns are 1..7; stop earlier if the table is narrower.
    last_feature = min(n, 8)
    if last_feature > 1:
        features = raw[:, 1:last_feature]
        centered = features - features.mean(axis=0)
        if m > 1:
            std = np.sqrt((centered ** 2).sum(axis=0) / (m - 1))
            # A constant column has zero spread; leave it at zero instead of
            # dividing by zero.
            std[std == 0.0] = 1.0
            # Scale every row.  The previous loop reused its column index as
            # the row index and therefore only ever touched the last row.
            centered = centered / std
        X[:m, 1:last_feature] = centered
    return X
# principal component analysis
def PCA(fileName='Regression_Admission.csv'):
    """Print the squared singular values of the module-level matrix ``X``.

    ``fileName`` is accepted for interface compatibility but is never read;
    the computation always runs on ``X`` as it currently stands (assumes
    ``normalize`` has already filled it -- confirm at the call site).

    Returns:
        None.
    """
    # Only the singular values are needed, so the U and V factors are skipped.
    sigma = np.linalg.svd(X, compute_uv=False, full_matrices=False)
    squared_sigma = sigma * sigma
    report = 'Lengths of the principal components are ' + str(squared_sigma) + '\n'
    print(report)
    return None
# extracting the input vector(predictors) from the data set
def getInputSamples(data, input_data_file):
    """Write the predictor columns of every sample to a CSV file.

    For each sample, every item except the first and the last is written as
    one CSV row.

    Args:
        data: Iterable of samples (sequences supporting slicing).
        input_data_file: Path of the CSV file to create or overwrite.

    Returns:
        None.
    """
    # The context manager closes (and flushes) the file before returning, so
    # a caller can safely read it back immediately afterwards.
    with open(input_data_file, "w", newline="") as handle:
        input_data = csv.writer(handle)
        for sample in data:
            input_data.writerow(sample[1:-1])
    return
# extracting the output vector(labels) from the data set
def getOutputValues(data, output_data_file):
    """Write the last item of every sample to a single-column CSV file.

    Args:
        data: Iterable of non-empty samples (sequences supporting indexing).
        output_data_file: Path of the CSV file to create or overwrite.

    Returns:
        None.

    Raises:
        IndexError: If a sample is empty.
    """
    # The context manager closes (and flushes) the file before returning, so
    # a caller can safely read it back immediately afterwards.
    with open(output_data_file, "w", newline="") as handle:
        output_data = csv.writer(handle)
        for sample in data:
            output_data.writerow([sample[-1]])
    return
# given a data set calculates both the input and output vectors for classification
def calculateInputOutput(class_data):
    """Split a data set into a predictor matrix and a label column.

    The predictors and labels are first written to ``Input_Train_Data.csv``
    and ``Output_Train_Data.csv`` via ``getInputSamples`` and
    ``getOutputValues``; both files are then read back and converted to float
    arrays.

    Args:
        class_data: Iterable of samples as accepted by ``getInputSamples`` and
            ``getOutputValues``.

    Returns:
        A two-element list ``[inputs, outputs]`` of float NumPy arrays built
        from the rows of the two files.
    """
    class_input = "Input_Train_Data.csv"
    class_output = "Output_Train_Data.csv"
    getInputSamples(class_data, class_input)
    getOutputValues(class_data, class_output)

    # Read inside context managers so the handles are closed right away.
    with open(class_input, "r", newline="") as handle:
        input_rows = list(csv.reader(handle))
    with open(class_output, "r", newline="") as handle:
        output_rows = list(csv.reader(handle))

    inputs = np.array(input_rows, dtype=float)
    outputs = np.array(output_rows, dtype=float)
    return [inputs, outputs]
# converts the 1 and 0 labeled data to 1 and -1 labeled data for SVM
def convertSVM(data_list):
    """Return a copy of ``data_list`` with ``"0.0"`` labels turned into ``"-1.0"``.

    The label is the last item of each row.  Only the exact string ``"0.0"``
    is replaced; every other value is kept as is.  Empty rows are passed
    through unchanged.

    Args:
        data_list: Iterable of rows (sequences of strings).

    Returns:
        A new list of new row lists; neither ``data_list`` nor its rows are
        modified.
    """
    # Copy each row as well: slicing only the outer list would share the row
    # objects and silently rewrite the caller's data.
    new_data_list = [list(row) for row in data_list]
    for row in new_data_list:
        if row and row[-1] == "0.0":
            row[-1] = "-1.0"
    return new_data_list