Merge pull request #30 from finos-voice/matthew/tweakCleanUp

improve clean_formatting routines
finos · Jan 11, 2019 · 0b61bb4
2 parents 9e1c238 + 69ff78d
commit 0b61bb4
Show file tree

Hide file tree

Showing 3 changed files with 13 additions and 4 deletions.
diff --git a/asrtoolkit/clean_formatting.py b/asrtoolkit/clean_formatting.py
@@ -242,7 +242,7 @@ def clean_up(input_line):
     'you can reach me at one three one seven two two two two two two two or fax me at five five five five five five five five five five'
   """
   for char_to_replace in ",*&!?":
-    input_line = input_line.replace(char_to_replace, '')
+    input_line = input_line.replace(char_to_replace, ' ')
 
   for pat in rematch:
     input_line = re.sub(rematch[pat][0], rematch[pat][1], input_line)
@@ -264,14 +264,14 @@ def clean_text_file(input_text_file):
   """
 
   with open(input_text_file, 'r', encoding='utf-8') as f:
-    lines = f.readlines()
+    lines = f.read().splitlines()
 
   cleaned = []
   for line in lines:
     cleaned.append(clean_up(line))
 
   with open(input_text_file.replace('.txt', '') + '_cleaned.txt', 'w', encoding='utf-8') as f:
-    f.writelines(cleaned)
+    f.write(" ".join(cleaned))
 
   print('File output: ' + input_text_file.replace('.txt', '') + '_cleaned.txt')
 

diff --git a/asrtoolkit/wer.py b/asrtoolkit/wer.py
@@ -10,6 +10,9 @@
 from asrtoolkit.data_structures.time_aligned_text import time_aligned_text
 from asrtoolkit.clean_formatting import clean_up
 
+# defines global regex for tagged noises
+re_tagged_noises = re.compile(r"[\[<][A-Za-z ]*[\]>]")
+
 # defines global regex to remove these nsns
 nonsilence_noises = ["noise", "um", "ah", "er", "umm", "uh", "mm", "mn", "mhm", "mnh"]
 re_nonsilence_noises = re.compile(r"\b({})\b".format("|".join(nonsilence_noises)))
@@ -36,10 +39,16 @@ def wer(ref, hyp, remove_nsns=False):
   if type(hyp) == time_aligned_text:
     hyp = hyp.text()
 
+  # remove tagged noises
+  ref = re.sub(re_tagged_noises, ' ', ref)
+  hyp = re.sub(re_tagged_noises, ' ', hyp)
+
+  # optionally, remove non silence noises
   if remove_nsns:
     ref = remove_nonsilence_noises(ref)
     hyp = remove_nonsilence_noises(hyp)
 
+  # clean punctuation, etc.
   ref = clean_up(ref)
   hyp = clean_up(hyp)
 

diff --git a/setup.py b/setup.py
@@ -12,7 +12,7 @@
 
 setup(
   name='asrtoolkit',
-  version='0.1.11',
+  version='0.1.12',
   description=
   'The GreenKey ASRToolkit provides tools for automatic speech recognition (ASR) file conversion and corpora organization.',
   long_description=long_description,