-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathprocess_retained.py
135 lines (114 loc) · 5.63 KB
/
process_retained.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# Performs various post-processing operations on the "retain.tsv" file and its associated wav files.
# * Deletes single misrecognition lines from the TSV file.
# * Deletes ranges of lines between "begin_misrecognition_mode" until "misrecognition" from the TSV file.
# * Deletes NoiseRule lines from the TSV file.
# * Deletes the most recent actual recognition every time it sees a "misrecognition" tag in a NoiseRule. This allows the user to tag a misrecognition even if there are noise utterances in between the misrecognition and the tag.
# * Deletes all wav files from filesystem that aren't listed in the processed TSV file.
# By Shervin Emami ([email protected]), 2020.
import time
import os
# TSV file that is read and filtered by removeBadFilesFromTSV().
inFilename = "retained/retain.tsv"
# TSV file that receives the lines that survive the filtering.
outFilename = "retained/processed.tsv"
def getSecondsSince2000(filename):
    """Convert a retained wav filename into a timestamp in whole seconds.

    The filename is expected to look like
    "retained/retain_2020-05-05_18-05-36_665975.wav": the date and time start
    at character 16 (directly after "retained/retain_") and are laid out as
    YYYY?MM?DD?hh?mm?ss, where each "?" is a single separator character that
    is skipped. The trailing microseconds field is ignored.

    Args:
        filename: Path of the wav file, in the layout described above.

    Returns:
        The number of seconds between 2000-01-01 00:00:00 and the timestamp
        embedded in the filename, as an int.

    Raises:
        ValueError: If the expected positions do not hold digits, or the
            digits do not form a valid calendar date and time.
    """
    # Imported locally so this function stays self-contained.
    from datetime import datetime

    start = 16  # Position of the year string within the filename string
    stamp = filename[start:start + 19]  # "YYYY-MM-DD_hh-mm-ss"

    year = int(stamp[0:4])
    month = int(stamp[5:7])
    day = int(stamp[8:10])
    hours = int(stamp[11:13])
    minutes = int(stamp[14:16])
    seconds = int(stamp[17:19])

    # Use real calendar arithmetic. Approximating every month as 30 days makes
    # the result non-monotonic across month boundaries (Jan 31 and Feb 1 would
    # land on the same day number), which breaks comparisons between
    # timestamps taken a few seconds apart around midnight.
    elapsed = datetime(year, month, day, hours, minutes, seconds) - datetime(2000, 1, 1)
    return elapsed.days * (24 * 60 * 60) + elapsed.seconds
def removeBadFilesFromTSV():
    """Filter the input TSV into the output TSV and delete wav files of rejected lines.

    Reads every line of the tab-separated file named by ``inFilename``. Of each
    line's columns this function uses column 0 (wav filename), column 1
    (duration, parsed as a float), column 3 (rule type) and column 6 (tag).

    A line is rejected, and its wav file deleted, when any of these holds:
      * its rule type is "IgnoredRule" or "NoiseRule";
      * its tag is "misrecognition";
      * it comes after a line tagged "begin_misrecognition_mode" and no closing
        "misrecognition" tag has been seen yet. Such a range is abandoned with
        a warning once a line is more than 60 seconds later than the line that
        opened it.

    Additionally, a "misrecognition" tag on a NoiseRule line rejects the most
    recently kept line, so a misrecognition can still be tagged when noise
    utterances were recorded in between.

    All kept lines are written, unmodified, to the file named by ``outFilename``.

    Returns:
        A tuple ``(num_recognitions, total, list_of_filenames)``: the number of
        kept lines, the sum of their durations, and their wav filenames in
        file order.

    Raises:
        IndexError: If a non-blank line has fewer than 7 tab-separated columns.
        ValueError: If a kept line's duration is not a number, or a filename
            does not contain a parsable timestamp.
    """
    print("Deleting files that are misrecognitions ...")

    with open(inFilename, "r") as f:
        data = f.readlines()

    # Each kept line is stored as (line, filename, duration), so that a later
    # "misrecognition" tag can undo it completely: output line, filename and totals.
    kept = []
    removeNextLines = False
    removeNextLines_timestamp = 0
    removeNextLines_filename = ""

    for line in data:
        # A blank line (such as a trailing newline at the end of the file) holds no record.
        if not line.strip():
            continue

        words = line.split('\t')
        print(words)
        filename = words[0]
        duration = words[1]
        ruleType = words[3]
        # Strip whitespace so the tag still matches when it is the last column
        # on the line and therefore carries the line's trailing newline.
        tag = words[6].strip()

        # Convert filename such as "retained/retain_2020-05-05_18-05-36_665975.wav" into a timestamp
        seconds_timestamp = getSecondsSince2000(filename)

        # Ignore the grammars we don't want, and lines with a "misrecognition" tag
        removeThisLine = ruleType in ("IgnoredRule", "NoiseRule") or tag == "misrecognition"

        # Remove lines marked as bad, and everything between "begin_misrecognition_mode" until "misrecognition".
        if removeNextLines or removeThisLine:
            print(" Deleting ", words)
            try:
                os.remove(filename)
            except OSError:
                # Best effort: the wav file may already be missing.
                pass
        else:
            # Remember the line so it can be written to the output TSV file later
            kept.append((line, filename, float(duration)))

        # A misrecognition tag on a NoiseRule rejects the most recent proper
        # line, not just the noise line. Rejected lines are never added to
        # `kept`, so its last entry is that most recent proper line.
        if tag == "misrecognition" and ruleType == "NoiseRule":
            if kept:
                _, rejected_filename, _ = kept.pop()
                print(" Deleting previous recognition", rejected_filename)
                try:
                    os.remove(rejected_filename)
                except OSError:
                    pass
            # With nothing kept so far there is nothing to undo.

        # Check if we should change modes
        if tag == "begin_misrecognition_mode":
            # Delete all the next lines
            removeNextLines = True
            removeNextLines_timestamp = seconds_timestamp
            removeNextLines_filename = filename
        if tag == "misrecognition":
            # Stop deleting the next lines
            removeNextLines = False

        if removeNextLines:
            # Put a timeout on misrecognition ranges
            if seconds_timestamp > removeNextLines_timestamp + 60:
                print("WARNING: begin_misrecognition_mode of", removeNextLines_filename, "has been running for", (seconds_timestamp - removeNextLines_timestamp), "seconds! Assuming the closing misrecognition tag was missing and thus accepting recognitions from here on.")
                # Stop deleting the next lines
                removeNextLines = False

    # Store the actual output file
    with open(outFilename, "w") as wf:
        wf.writelines(kept_line for kept_line, _, _ in kept)

    # Derive the results from `kept`, so they always agree with the output file.
    num_recognitions = len(kept)
    total = sum(kept_duration for _, _, kept_duration in kept)
    list_of_filenames = [kept_filename for _, kept_filename, _ in kept]
    return num_recognitions, total, list_of_filenames
def removeFilesInFolderThatArentInTSV(list_of_filenames):
    """Delete every "retained/retain_*.wav" file that is not in ``list_of_filenames``.

    Args:
        list_of_filenames: Paths of the wav files to keep, relative to the
            current working directory (e.g. "retained/retain_....wav").

    Deleting is best effort: a file that cannot be removed is skipped.
    """
    import glob

    print("Deleting files in 'retained' folder that aren't in the TSV file of", len(list_of_filenames), "names ...")

    # Build the lookup once, so each membership test is O(1) instead of a list scan.
    # Paths are normalized on both sides so that equivalent spellings of the same
    # path ("./retained/x.wav", or "/" versus the platform separator that glob
    # returns) still match and the file is kept.
    wanted = {os.path.normpath(name) for name in list_of_filenames}

    for filename in glob.glob("retained/retain_*.wav"):
        if os.path.normpath(filename) in wanted:
            continue
        print(" Deleting unused file", filename)
        try:
            os.remove(filename)
        except OSError:
            # Best effort: the file may have vanished or be locked.
            pass
# Run the whole clean-up: filter the TSV first, then delete the wav files it no longer lists.
num_recognitions, total, list_of_filenames = removeBadFilesFromTSV()
print()
removeFilesInFolderThatArentInTSV(list_of_filenames)

# Split the duration by hand instead of going through time.gmtime(): gmtime wraps
# the hour field at 24, so a total of a day or more would be reported too small.
# Below 24 hours the text is identical to the strftime('%H ..., %M ..., %S ...') form.
total_minutes, secs = divmod(int(total), 60)
hours, mins = divmod(total_minutes, 60)
time_str = "%02d hours, %02d mins, %02d seconds" % (hours, mins, secs)
print()
print(int(total), "seconds across", num_recognitions, "utterances means", time_str)
print("Stored processed output file as '" + outFilename + "'")
print()