-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathPCA_tSNE_analysis.py
111 lines (94 loc) · 3.68 KB
/
PCA_tSNE_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Candidate session IDs 300..492 with the entries at the listed positions
# removed.
# NOTE(review): presumably the removed positions are sessions that are
# missing or unusable in the dataset -- confirm against the data folder.
skipped_positions = [18, 21, 41, 42, 62, 94, 98, 151, 158, 160, 180]
l = np.delete(np.arange(300, 493, 1), skipped_positions)

# Accumulators filled by the loops further down the script:
#   path1 / path2 -> transcript and COVAREP file paths, one per session
#   ffeatures     -> per-session feature tables
#   foutput       -> per-session label columns
# (A third list for FORMANT paths is intentionally not created.)
path1, path2 = [], []
ffeatures, foutput = [], []

# Second column of the header-less label file, as a NumPy array.
# NOTE(review): presumably one PHQ score per session, in the same order as
# `l` -- confirm against PHQ_Patient.csv.
outputs = pd.read_csv('PHQ_Patient.csv', sep=',', header=None).iloc[:, 1]
output = np.asarray(outputs)
# Build the two input paths for every session ID: the transcript CSV and the
# COVAREP feature CSV inside that session's "<id>_P" folder.
# (The FORMANT CSV that lives next to the COVAREP file is not used.)
transcript_dir = 'C:\\Users\\Prajwal\\Downloads\\transcript\\transcript\\'
covarep_dir = 'C:\\Users\\Prajwal\\Downloads\\wwwdaicwoz\\wwwdaicwoz\\'
for session_id in l:
    sid = str(session_id)
    path1.append(transcript_dir + sid + '_TRANSCRIPT.csv')
    path2.append(covarep_dir + sid + '_P\\' + sid + '_COVAREP.csv')
# For every session, keep only the COVAREP rows that fall inside the
# participant's speaking turns and pair them with that session's label.
for j, (transcript_path, covarep_path) in enumerate(zip(path1, path2)):
    frames = pd.read_csv(transcript_path, sep='\t')
    features = pd.read_csv(covarep_path, sep=',', header=None)

    # Transcript rows whose speaker starts with 'Participant'.
    participant_turns = frames[frames['speaker'].str.match('Participant')]

    # First two transcript columns, scaled by 100, are used as start/stop
    # row offsets into the COVAREP table.
    # NOTE(review): presumably these are times in seconds and COVAREP has
    # 100 rows per second -- confirm.
    turn_bounds = participant_turns.iloc[:, 0:2].values * 100

    # One COVAREP slice per participant turn, stacked in transcript order.
    turn_slices = [
        features.iloc[int(first_row):int(last_row), :]
        for first_row, last_row in turn_bounds
    ]
    final_features = pd.concat(turn_slices)
    ffeatures.append(final_features)

    # One copy of the session label per extracted feature row, as a column.
    foutput.append(np.full([len(final_features), 1], output[j]))
# Stack the per-session pieces into one feature table and one (N, 1) label
# column covering all sessions.
ffeatures = pd.concat(ffeatures)
foutput = np.concatenate(foutput)

# Binary target: 1 where the session label is strictly greater than 10,
# otherwise 0. Done as a single vectorised comparison instead of a Python
# loop over every row; the result has the same shape and integer dtype.
foutput1 = (foutput > 10).astype(int)
# Standardise every feature column, then wrap the result in a new DataFrame
# (this resets the index to 0..N-1 and gives integer column labels).
scaled_values = StandardScaler().fit_transform(ffeatures)
ffeatures = pd.DataFrame(data=scaled_values)

# Attach the binary label as a 'Target' column; both frames share the
# default 0..N-1 index, so rows line up positionally.
target1 = pd.DataFrame(data=foutput1, columns=['Target'])
finalDf = pd.concat([ffeatures, target1], axis=1)

# Split by class, then draw 5000 random rows from each class (depressed
# first, control second) and stack them into a balanced subset.
is_depressed = finalDf['Target'] == 1
is_control = finalDf['Target'] == 0
depressed = finalDf[is_depressed]
control = finalDf[is_control]
depressed1 = depressed.sample(n=5000)
control1 = control.sample(n=5000)
finalDF = pd.concat([depressed1, control1], axis=0)

# Feature-only version of the balanced subset.
finalDF1 = finalDF.drop(columns=['Target'])
# PCA
from sklearn.decomposition import PCA

# Project the standardised features (the full table, not the balanced
# subset) onto the first 10 principal components.
pca = PCA(n_components=10)
principalComponents = pca.fit_transform(ffeatures)

# Name one column per returned component. The previous hard-coded list of
# two names did not match the 10 columns produced above and made the
# DataFrame constructor raise a shape-mismatch ValueError. The first two
# names are unchanged.
pca_columns = [
    'principal component {}'.format(component_no)
    for component_no in range(1, principalComponents.shape[1] + 1)
]
principalDf = pd.DataFrame(data=principalComponents, columns=pca_columns)
# tSNE
import time
from sklearn.manifold import TSNE

# Embed the balanced feature table with default t-SNE settings and report
# the wall-clock time taken.
time_start = time.time()
tsne = TSNE()
tsne_results = tsne.fit_transform(finalDF1)
elapsed = time.time() - time_start
print('t-SNE done! Time elapsed: {} seconds'.format(elapsed))

# The embedding columns reuse the 'principal component N' names; the
# plotting code looks them up by these exact labels.
embedding_columns = ['principal component 1', 'principal component 2']
tsne_res = pd.DataFrame(data=tsne_results, columns=embedding_columns)

# Re-wrap the labels through a plain array so they get a fresh 0..N-1 index
# and line up row-for-row with the embedding when concatenated.
target2 = pd.DataFrame(data=np.asarray(finalDF['Target']), columns=['Target'])
finDf = pd.concat([tsne_res, target2], axis=1)
# Scatter plot
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('x-tSNE', fontsize=15)
ax.set_ylabel('y-tSNE', fontsize=15)
ax.set_title('tSNE plot', fontsize=20)

# 'Target' stores the numeric class codes (1 = depressed, 0 = control), so
# rows must be selected by code. Comparing the column against the display
# names matched nothing and left the axes empty.
target_codes = [1, 0]
targets = ['depressed', 'control']
colors = ['r', 'b']
for code, color in zip(target_codes, colors):
    indicesToKeep = finDf['Target'] == code
    ax.scatter(
        finDf.loc[indicesToKeep, 'principal component 1'],
        finDf.loc[indicesToKeep, 'principal component 2'],
        c=color,
    )
# Legend entries are given in the same order the classes were drawn.
ax.legend(targets)
ax.grid()