Skip to content
This repository has been archived by the owner on Nov 28, 2022. It is now read-only.

Commit

Permalink
Merge pull request #30 from finos-voice/matthew/tweakCleanUp
Browse files Browse the repository at this point in the history
improve clean_formatting routines
  • Loading branch information
mgoldey authored Jan 11, 2019
2 parents 9e1c238 + 69ff78d commit 0b61bb4
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 4 deletions.
6 changes: 3 additions & 3 deletions asrtoolkit/clean_formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def clean_up(input_line):
'you can reach me at one three one seven two two two two two two two or fax me at five five five five five five five five five five'
"""
for char_to_replace in ",*&!?":
- input_line = input_line.replace(char_to_replace, '')
+ input_line = input_line.replace(char_to_replace, ' ')

for pat in rematch:
input_line = re.sub(rematch[pat][0], rematch[pat][1], input_line)
Expand All @@ -264,14 +264,14 @@ def clean_text_file(input_text_file):
"""

with open(input_text_file, 'r', encoding='utf-8') as f:
- lines = f.readlines()
+ lines = f.read().splitlines()

cleaned = []
for line in lines:
cleaned.append(clean_up(line))

with open(input_text_file.replace('.txt', '') + '_cleaned.txt', 'w', encoding='utf-8') as f:
- f.writelines(cleaned)
+ f.write(" ".join(cleaned))

print('File output: ' + input_text_file.replace('.txt', '') + '_cleaned.txt')

Expand Down
9 changes: 9 additions & 0 deletions asrtoolkit/wer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
from asrtoolkit.data_structures.time_aligned_text import time_aligned_text
from asrtoolkit.clean_formatting import clean_up

+ # defines global regex for tagged noises
+ re_tagged_noises = re.compile(r"[\[<][A-Za-z ]*[\]>]")
+
# defines global regex to remove these nsns
nonsilence_noises = ["noise", "um", "ah", "er", "umm", "uh", "mm", "mn", "mhm", "mnh"]
re_nonsilence_noises = re.compile(r"\b({})\b".format("|".join(nonsilence_noises)))
Expand All @@ -36,10 +39,16 @@ def wer(ref, hyp, remove_nsns=False):
if type(hyp) == time_aligned_text:
hyp = hyp.text()

# remove tagged noises
ref = re.sub(re_tagged_noises, ' ', ref)
hyp = re.sub(re_tagged_noises, ' ', hyp)

# optionally, remove non silence noises
if remove_nsns:
ref = remove_nonsilence_noises(ref)
hyp = remove_nonsilence_noises(hyp)

# clean punctuation, etc.
ref = clean_up(ref)
hyp = clean_up(hyp)

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

setup(
name='asrtoolkit',
- version='0.1.11',
+ version='0.1.12',
description=
'The GreenKey ASRToolkit provides tools for automatic speech recognition (ASR) file conversion and corpora organization.',
long_description=long_description,
Expand Down

0 comments on commit 0b61bb4

Please sign in to comment.