# data_preprocessing.py
import pandas as pd
import numpy as np
import argparse
import json
import os
from audio_utils import wav_to_feature
def create_coswara_json(coswara_dir, coswara_metadata):
    """Build the per-patient index for the Coswara dataset.

    Args:
        coswara_dir: Root directory of the Coswara data. Its sub-folders whose
            names start with '2' are scanned; each entry inside such a folder
            is treated as a patient id. A trailing path separator is optional.
        coswara_metadata: Table iterated with ``iterrows()``; each row must
            provide the ``'id'`` and ``'covid_status'`` fields.

    Returns:
        A dict mapping patient id to
        ``{'feature_paths': ..., 'pcr_test_result': ...}`` where the result is
        one of ``'positive'``, ``'negative'`` or ``'untested'``. Patients with
        fewer than 5 feature paths are left out.

    Raises:
        KeyError: If a metadata id has no matching folder under any of the
            scanned date folders.
    """
    # Every folder here represents a date we need. Paths are built with
    # os.path.join so the scan works whether or not coswara_dir ends with a
    # separator (plain string concatenation silently found nothing otherwise).
    date_folders = [
        name for name in os.listdir(coswara_dir)
        if name.startswith('2')
        and os.path.isdir(os.path.join(coswara_dir, name))
    ]

    # Map each patient id to the date folder that contains it.
    datestrings = {}
    for folder in date_folders:
        for uid in os.listdir(os.path.join(coswara_dir, folder)):
            datestrings[uid] = folder

    positive_statuses = {'positive_mild', 'positive_moderate', 'positive_asymp'}
    negative_statuses = {'healthy'}

    coswara_json = {}
    # Per-label tallies of kept patients and of their feature paths.
    patient_counts = {'positive': 0, 'negative': 0, 'untested': 0}
    path_counts = {'positive': 0, 'negative': 0, 'untested': 0}

    for _, row in coswara_metadata.iterrows():
        # Note that we only need sound, patient_id and their pcr_test results
        # for our experiments.
        uid = row['id']
        patient_folder = os.path.join(coswara_dir, datestrings[uid], uid)
        wav_paths = [
            os.path.join(patient_folder, name)
            for name in os.listdir(patient_folder)
            # Names starting with "._" are skipped
            # (presumably macOS resource-fork files -- TODO confirm).
            if '.wav' in name and not name.startswith('._')
        ]
        feature_paths = wav_to_feature(wav_paths, store=True)
        num_of_paths = len(feature_paths)
        if num_of_paths < 5:
            # Too few usable recordings for this patient.
            continue

        status = row['covid_status']
        if status in positive_statuses:
            label = 'positive'
        elif status in negative_statuses:
            label = 'negative'
        else:
            label = 'untested'
        patient_counts[label] += 1
        path_counts[label] += num_of_paths

        coswara_json[uid] = {
            'feature_paths': feature_paths,
            'pcr_test_result': label,
        }

    print(patient_counts['positive'], patient_counts['negative'],
          patient_counts['untested'])
    print(path_counts['positive'], path_counts['negative'],
          path_counts['untested'])
    return coswara_json
def create_coughvid_json(coughvid_dir, coughvid_metadata, threshold=0.7):
    """Build the per-patient index for the COUGHVID dataset.

    Args:
        coughvid_dir: Directory that the ``'cough_path'`` values are relative to.
        coughvid_metadata: Table iterated with ``iterrows()``; each row must
            provide ``'cough_detected'``, ``'patient_id'``,
            ``'pcr_test_result_inferred'`` and ``'cough_path'``.
        threshold: Rows whose ``'cough_detected'`` value is below this are
            skipped.

    Returns:
        A dict mapping patient id to
        ``{'pcr_test_result': ..., 'feature_paths': ...}``. Recordings of a
        patient seen more than once are appended to the same entry.

    Raises:
        AssertionError: If a repeated patient id carries a different inferred
            test result than the one already stored.
    """
    coughvid_json = {}
    for _, row in coughvid_metadata.iterrows():
        if row['cough_detected'] < threshold:
            continue

        patient_id = row['patient_id']
        label = row['pcr_test_result_inferred']
        entry = coughvid_json.get(patient_id)

        if entry is not None:
            print('warning: detect the same speaker: {}!'.format(patient_id))
            stored = entry['pcr_test_result']
            # Compare against the same column the stored label came from;
            # two missing (NaN) labels count as a match.
            assert stored == label or (pd.isna(stored) and pd.isna(label)), \
                'conflicting pcr_test_result_inferred for patient {}'.format(
                    patient_id)

        feature_paths = wav_to_feature(
            [os.path.join(coughvid_dir, row['cough_path'])])
        if len(feature_paths) == 0:
            # Nothing usable came back for this recording.
            continue

        if entry is None:
            coughvid_json[patient_id] = {
                'pcr_test_result': label,
                'feature_paths': feature_paths,
            }
        else:
            entry['feature_paths'] += feature_paths

    print(len(coughvid_json))
    return coughvid_json
def preprocess_coswara():
    """Index the Coswara dataset and write it to ``datasets/coswara.json``."""
    data_root = "datasets/Coswara-Data/"
    # data_root ends with a separator, so concatenation yields the CSV path.
    metadata = pd.read_csv(data_root + 'combined_data.csv')
    # Serialise fully before opening the output file.
    serialized = json.dumps(create_coswara_json(data_root, metadata), indent=4)
    with open("datasets/coswara.json", "w") as sink:
        sink.write(serialized)
def preprocess_coughvid():
    """Index the COUGHVID dataset and write it to ``datasets/coughvid.json``."""
    data_root = 'datasets/virufy-cdf-coughvid'
    metadata_path = os.path.join(data_root, 'virufy-cdf-coughvid.csv')
    metadata = pd.read_csv(metadata_path)
    # Serialise fully before opening the output file.
    serialized = json.dumps(create_coughvid_json(data_root, metadata), indent=4)
    with open('datasets/coughvid.json', 'w') as sink:
        sink.write(serialized)
if __name__ == "__main__":
    # Command-line entry point: choose which dataset to preprocess.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-d", "--dataset",
        type=str,
        default="coswara",
        choices=["coswara", "coughvid"],
    )
    selected = cli.parse_args().dataset

    # Dispatch on the dataset name; argparse restricts it to the keys below.
    runners = {
        "coswara": preprocess_coswara,
        'coughvid': preprocess_coughvid,
    }
    runner = runners.get(selected)
    if runner is not None:
        runner()