Skip to content
This repository has been archived by the owner on Nov 28, 2022. It is now read-only.

Commit

Permalink
Merge pull request #50 from finos-voice/matthew/FixUpDollarFormatting
Browse files Browse the repository at this point in the history
fix up dollar formatting and print lines that fail on cleaning
  • Loading branch information
mgoldey authored May 22, 2019
2 parents 97c1c15 + eb56063 commit b18d3ad
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 10 deletions.
44 changes: 35 additions & 9 deletions asrtoolkit/clean_formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def fraction_to_string(input_string):
lambda m: " ".join([digits_to_string(m.groups()[0]), m.groups()[1], "dollars"])
)
),
("dollars", (re.compile(r"\$[0-9]{1,}\.?[0-9]{0,}\w"), lambda m: dollars_to_string(m.group()))),
("dollars", (re.compile(r"\$[0-9]{1,}\.?[0-9]{0,}"), lambda m: dollars_to_string(m.group()))),
("percent", (re.compile(r"\%"), lambda m: " percent")),
("fractions", (re.compile(r"\b[0-9]\s?\/\s[0-9]\b"), lambda m: fraction_to_string(m.group()))),
("plural_numbers", (re.compile(r"\b[0-9]{1,}s\b"), lambda m: plural_numbers_to_string(m.group()))),
Expand All @@ -213,6 +213,35 @@ def fraction_to_string(input_string):
)


def remove_special_chars(line, chars_to_replace):
    """
    Replace every occurrence of each special character in a line with a space.

    Args:
        line: the text to clean.
        chars_to_replace: iterable of substrings to blank out. A plain
            string is iterated character by character, so ",*&!?" blanks
            each of those five characters.

    Returns:
        The text with every listed substring replaced by a single ' '.
        Spaces are not collapsed here, so adjacent replacements leave
        consecutive spaces behind.
    """
    for char_to_replace in chars_to_replace:
        # str.replace('', ' ') would insert a space between every character
        # of the line, so an empty entry is skipped instead of applied.
        if char_to_replace:
            line = line.replace(char_to_replace, ' ')
    return line


def remove_double_spaces(line):
    """
    Collapse every run of two or more spaces into a single space.

    Only the space character is affected; tabs and newlines are left
    untouched, and a single leading or trailing space is preserved.

    Args:
        line: the text to normalise.

    Returns:
        The text with no two consecutive spaces remaining.
    """
    # The quantifier spells out "two or more spaces" explicitly, so the
    # pattern cannot silently degrade into matching a single space (which
    # would make a replace-until-gone loop spin forever).
    return re.sub(r" {2,}", " ", line)


def apply_all_regex_and_replacements(input_line, rematch):
    """
    Apply every regex/replacement pair in a mapping to a line, in order.

    Args:
        input_line: the text to transform.
        rematch: mapping of rule name to a sequence whose first item is
            the pattern and whose second item is the replacement (a string
            or a callable taking the match object), as accepted by re.sub.

    Returns:
        The line after all rules have been applied in the mapping's
        iteration order. A rule that raises is reported on stdout and
        skipped; the line keeps the value it had before that rule and the
        remaining rules still run.
    """

    for _name, rule in rematch.items():
        pattern, replacement = rule[0], rule[1]
        try:
            input_line = re.sub(pattern, replacement, input_line)
        except Exception as exc:
            # Deliberately best-effort: replacement callables can fail on
            # unexpected text, and one bad rule should not abort cleaning.
            print("Exception {} with line {}".format(exc, input_line))

    return input_line


def clean_up(input_line):
"""
Apply all text cleaning operations to input line
Expand Down Expand Up @@ -241,20 +270,17 @@ def clean_up(input_line):
>>> clean_up("you can reach me at 1-(317)-222-2222 or fax me at 555-555-5555")
'you can reach me at one three one seven two two two two two two two or fax me at five five five five five five five five five five'
"""
for char_to_replace in ",*&!?":
input_line = input_line.replace(char_to_replace, ' ')
input_line = remove_special_chars(input_line, ",*&!?")

for pat in rematch:
input_line = re.sub(rematch[pat][0], rematch[pat][1], input_line)
input_line = apply_all_regex_and_replacements(input_line, rematch)

for char_to_replace in ",.-":
input_line = input_line.replace(char_to_replace, ' ')
input_line = remove_special_chars(input_line, ",.-")

input_line = input_line.encode().decode('utf-8').lower()

# check for double spacing
while " " in input_line:
input_line = input_line.replace(" ", " ")
input_line = remove_double_spaces(input_line)

return input_line.strip()


Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

setup(
name='asrtoolkit',
version='0.1.16',
version='0.1.17',
description=
'The GreenKey ASRToolkit provides tools for automatic speech recognition (ASR) file conversion and corpora organization.',
long_description=long_description,
Expand Down

0 comments on commit b18d3ad

Please sign in to comment.