-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimplify_dataset.py
128 lines (109 loc) · 4.75 KB
/
simplify_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# -*- coding: utf-8 -*-
__version__ = '1.0'
__author__ = 'Wawrzyński Adam, Szypryt Kamil'
import os
import sys
import argparse
# Exact-label folding table used to reduce TIMIT's 61 phoneme labels to 39.
# Lookup is by whole label, so 'ax', 'ax-h' and 'axr' (or 'en' and 'eng')
# can never shadow one another the way substring matching would.
# 'ah-h' and 'ahr' are not TIMIT labels: they are the intermediate spellings
# that substring-based replacement of 'ax' leaves behind for 'ax-h' and
# 'axr', and are kept so that such half-converted files are still repaired.
_PHONEME_FOLDING = {
    'ao': 'aa',
    'ax': 'ah',
    'ax-h': 'ah',
    'ah-h': 'ah',
    'ahr': 'er',
    'axr': 'er',
    'hv': 'hh',
    'ix': 'ih',
    'el': 'l',
    'em': 'm',
    'en': 'n',
    'nx': 'n',
    'eng': 'ng',
    'zh': 'sh',
    'ux': 'uw',
    'q': 'sil',
    'pcl': 'sil',
    'tcl': 'sil',
    'kcl': 'sil',
    'bcl': 'sil',
    'dcl': 'sil',
    'gcl': 'sil',
    'h#': 'sil',
    'pau': 'sil',
    'epi': 'sil',
}


def _fold_phn_line(line):
    """Return *line* with its last whitespace-separated field folded.

    Lines that are blank, or whose last field is not in the folding table,
    are returned unchanged. Leading fields, inner spacing and the line
    terminator are preserved exactly.
    """
    body = line.rstrip()
    fields = body.split()
    if not fields:
        return line
    label = fields[-1]
    folded = _PHONEME_FOLDING.get(label)
    if folded is None:
        return line
    # Swap only the trailing label; keep everything before it and the
    # original trailing whitespace/newline.
    return body[:len(body) - len(label)] + folded + line[len(body):]


def process_file(filename):
    """Simplify TIMIT .PHN file from 61 to 39 phonemes.

    The file is rewritten in place: every line's final field (the phoneme
    label) is looked up in the folding table and replaced when present.
    The result is first written to ``filename + "tmp"`` and then moved over
    the original.

    Args:
        filename: path to the .PHN file to rewrite.

    Raises:
        OSError: if the file cannot be read, or the temporary file cannot
            be written or moved into place.
    """
    tmp_name = filename + "tmp"
    with open(filename, "rt") as fin, open(tmp_name, "wt") as fout:
        fout.writelines(_fold_phn_line(line) for line in fin)
    # os.replace overwrites the destination in one step, so there is no
    # moment at which the original file has been deleted but not replaced.
    os.replace(tmp_name, filename)
def simplify_phonemes_file(filename):
    """Simplify TIMIT alphabet file from 61 to 39 phonemes.

    The file's contents are replaced by a fixed list of 39 phoneme labels,
    one per line. The list is written to ``filename + "tmp"`` first and then
    moved over ``filename``; the target is created if it does not exist yet.

    Args:
        filename: path to the alphabet file to (over)write.

    Raises:
        OSError: if the temporary file cannot be written or moved into place.
    """
    tmp_name = filename + "tmp"
    with open(tmp_name, "wt") as fout:
        fout.write("sh\ng\nl\nah\ndx\nch\nae\nz\nn\ny\neh\nsil\naa\nih\n"
                   "k\nth\naw\nb\nf\noy\nd\nay\nw\now\np\nt\ner\njh\ns\ney\n"
                   "ng\nuw\nr\niy\nv\nm\nuh\nhh\ndh\n")
    # os.replace works whether or not the target already exists, and never
    # leaves a gap in which the alphabet file is missing.
    os.replace(tmp_name, filename)
def simplify_dataset(path):
    """Simplify TIMIT dataset from 61 to 39 phonemes.

    Walks ``path`` recursively. Every sub-directory is descended into and
    every file whose extension is ``.PHN`` (compared case-insensitively) is
    rewritten in place by ``process_file``. All other files are left alone,
    so entries without a matching .PHN file no longer cause a failure.

    Args:
        path: root directory of the dataset (or any sub-directory of it).

    Raises:
        OSError: if a directory cannot be listed or a .PHN file cannot be
            rewritten.
    """
    # Sorted so that the processing order is deterministic across platforms.
    for entry in sorted(os.listdir(path)):
        full_path = os.path.join(path, entry)
        if os.path.isdir(full_path):
            # Test the real entry name: directory names may contain dots.
            simplify_dataset(full_path)
        elif os.path.splitext(entry)[1].upper() == ".PHN":
            process_file(full_path)
if __name__ == "__main__":
    # build the command-line interface
    cli = argparse.ArgumentParser()
    # both options are mandatory and accept a long and a short spelling
    cli.add_argument("--dataset", "-d", dest="dataset", required=True,
                     help="path to dataset root directory")
    cli.add_argument("--phonemes", "-p", dest="phonemes", required=True,
                     help="path to alphabet file")
    # parse the command line, then rewrite the dataset and the alphabet file
    options = cli.parse_args()
    simplify_dataset(options.dataset)
    simplify_phonemes_file(options.phonemes)