Skip to content
This repository has been archived by the owner on Nov 28, 2022. It is now read-only.

Commit

Permalink
Merge pull request #26 from finos-voice/ageojo/segment
Browse files Browse the repository at this point in the history
segments.py
  • Loading branch information
mgoldey authored Jan 3, 2019
2 parents 160a2c5 + af1df17 commit 9e1c238
Show file tree
Hide file tree
Showing 7 changed files with 80 additions and 26 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@ optional arguments:
```
This tool allows for easy conversion among file formats listed above.

Note: Attributes of a segment object not present in a parsed file retain their default values

- For example, a `segment` object is created for each line of an STM file
- each is initialized with the following default values, which are not encoded in STM files: `formatted_text=''`; `confidence=1.0`



### wer
```text
usage: wer [-h] [--char-level] [--ignore-nsns]
Expand Down
2 changes: 1 addition & 1 deletion asrtoolkit/data_handlers/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def footer():
def format_segment(seg):
"""
Formats a segment assuming it's an instance of class segment with elements
audiofile, channel, speaker, start and stop times, label, and text
filename, channel, speaker, start and stop times, label, and text
"""

return "<tr>" + "".join(
Expand Down
43 changes: 36 additions & 7 deletions asrtoolkit/data_handlers/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,16 @@ def footer():
def format_segment(seg):
"""
Formats a segment assuming it's an instance of class segment with elements
audiofile, channel, speaker, start and stop times, label, and text
filename, channel, speaker, start and stop times, label, and text
:param: seg: segment object
:return: dict: key/val pairs contain 'segment'-level information
values of output segment-dict are values of corresponding input segment attributes
output_dict['startTimeSec'] = segment.start (converse of parse_segment)
the segment dict structure generated is the output of all microservices and input to most
each segment-dict contains a fragment of text (+ additional information)
"""
output_dict = {}
output_dict['speakerInfo'] = seg.speaker
Expand All @@ -33,22 +42,33 @@ def format_segment(seg):
output_dict['genderInfo'] = {'gender': seg.label.split(",")[-1].replace(">", "")}
output_dict['punctuated_transcript'] = seg.formatted_text
output_dict['transcript'] = seg.text
output_dict['confidence'] = seg.confidence

return json.dumps(output_dict, ensure_ascii=True)


def parse_segment(input_seg):
"""
Creates a segment object from an input GreenKey segment
"""
:param: input_seg: dict (segment-level dict: input_data['segments'][i]['segment'])
-> dict with keys 'channel', 'startTimeSec' etc mapping to segment object attributes
:return: segment object; attribute values are set to those of corresponding segment-dict keys
segment.start = segment_dict['startTimeSec'] (reverse mapping from format_segment)
"""
extracted_dict = {}

def assign_if_present(value, dict_key=None, interior_key=None, proc_val=lambda val: val):
"""
:param value: type?
:param dict_key:
:param interior_key:
:param proc_val:
:return: type?
Assigns value to extracted_dict object if present in input_seg
"""

dict_key = value if dict_key is None else dict_key

if value in input_seg and interior_key and interior_key in input_seg[value]:
Expand All @@ -68,20 +88,31 @@ def assign_if_present(value, dict_key=None, interior_key=None, proc_val=lambda v
assign_if_present('punctuated_transcript', 'formatted_text')
assign_if_present('speakerInfo', 'speaker', 'ID')
assign_if_present('genderInfo', 'label', 'gender', lambda gender: "<o,f0,{:}>".format(gender))
assign_if_present('confidence', 'confidence')

seg = segment(extracted_dict)

except Exception as exc:
print(exc)
print(exc) #TODO log instead

return seg if seg and seg.validate() else None


def read_in_memory(input_data):
    """
    Build segment objects from an already-loaded JSON transcript.

    :param input_data: dict with key 'segments'; each item of
      input_data['segments'] is passed to parse_segment
      - per the parse_segment docstring, item keys map to `segment` attributes
        and the labels may differ, e.g. 'startTimeSec' -> segment.start
    :return: list of the non-None results of parse_segment, in input order
      (items for which parse_segment returns None are dropped)
    """
    # parse lazily, then keep only the segments that parsed successfully
    parsed = (parse_segment(item) for item in input_data['segments'])
    return [seg for seg in parsed if seg is not None]

Expand All @@ -90,9 +121,7 @@ def read_file(file_name):
"""
Reads a JSON file, skipping any bad segments
"""
segments = []
with open(file_name, encoding="utf-8") as f:
input_json = json.load(f)
segments = read_in_memory(input_json)

return segments
2 changes: 1 addition & 1 deletion asrtoolkit/data_handlers/srt.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
def format_segment(seg):
"""
Formats a segment assuming it's an instance of class segment with elements
audiofile, channel, speaker, start and stop times, label, and text
filename, channel, speaker, start and stop times, label, and text
"""

ret_str = "1\n{:} --> {:}\n".format(seconds_to_timestamp(seg.start), seconds_to_timestamp(seg.stop)).replace(".", ",")
Expand Down
22 changes: 13 additions & 9 deletions asrtoolkit/data_handlers/stm.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,33 +16,37 @@

def format_segment(seg):
    """
    Format a segment as the text of one STM line.

    :param seg: segment object; its instance __dict__ must contain the keys
      filename, channel, speaker, start, stop, label, and text
    :return str: the six metadata fields (each converted with str) followed by
      the cleaned-up text, joined by single spaces
    """
    fields = ('filename', 'channel', 'speaker', 'start', 'stop', 'label')
    attrs = seg.__dict__
    # clean_up used to unformat stm file text
    return " ".join([str(attrs[name]) for name in fields] + [clean_up(attrs['text'])])


def parse_line(line):
    """
    Parse a single line of an STM file.

    :param line: str; a single line of an stm file
    :return: segment object when the line has more than six
      whitespace-separated fields and the segment's validate() succeeds;
      else None
    """
    data = line.strip().split()

    # six leading metadata fields plus at least one token of text are required
    if len(data) <= 6:
        return None

    fields = ('filename', 'channel', 'speaker', 'start', 'stop', 'label')
    seg_dict = dict(zip(fields, data[:6]))
    # everything after the metadata fields is the transcript text
    seg_dict['text'] = " ".join(data[6:])

    seg = segment(seg_dict)
    return seg if seg and seg.validate() else None
Expand All @@ -51,12 +55,12 @@ def parse_line(line):
def read_file(file_name):
    """
    Read an STM file into a list of segments.

    :param file_name: path of the STM file; opened as UTF-8 text
    :return: list of segment objects, one per line for which parse_line
      returns a segment, in file order; lines for which parse_line returns
      None (e.g. gap lines) are skipped
    """
    with open(file_name, encoding="utf-8") as stm_file:
        parsed = (parse_line(line) for line in stm_file)
        return [seg for seg in parsed if seg is not None]
2 changes: 1 addition & 1 deletion asrtoolkit/data_handlers/vtt.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def header():
def format_segment(seg):
"""
Formats a segment assuming it's an instance of class segment with elements
audiofile, channel, speaker, start and stop times, label, and text
filename, channel, speaker, start and stop times, label, and text
"""

ret_str = "{:} --> {:}".format(seconds_to_timestamp(seg.start), seconds_to_timestamp(seg.stop))
Expand Down
28 changes: 21 additions & 7 deletions asrtoolkit/data_structures/segment.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,15 @@ def clean_float(input_float):
class segment(object):
"""
Class for holding segment-specific information
A segment object corresponds to the dict under the key 'segment'
in the ASR-generated transcript (lattice)
- the fields included below are shared across 'segments'
but 'segments' may contain many other fields (e.g. sentiment) depending on
the text processing pipeline selected.
"""

# refer to some file if possible
audiofile = "unknown"
filename = "unknown"
# by default, use channel 1
channel = "1"
# need a speaker id
Expand All @@ -58,23 +63,32 @@ class segment(object):
text = ""
# text for printing out to fancy output formats
formatted_text = ""
# confidence in accuracy of text
confidence = 1.0

def __init__(self, input_dict=None):
"""
Stores and initializes audiofile, channel, speaker, start & stop times, label, and text
Stores and initializes filename, channel, speaker, start & stop times, label,
and formatted and unformatted text fields.
- Unmodified ASR transcripts are unformatted text.
- Raw Chat data is formatted text;
`clean_up` from asrtoolkit.clean_formatting is used to convert it to unformatted text
Note: `channel` (as currently defined) applies only to audio input
- all chat data will retain default value of '1'
>>> seg = segment({"text":"this is a test"})
"""
self.__dict__ = {
'audiofile': self.audiofile,
'filename': self.filename,
'channel': self.channel,
'speaker': self.speaker,
'start': self.start,
'stop': self.stop,
'label': self.label,
'text': self.text,
'formatted_text': self.formatted_text
'formatted_text': self.formatted_text,
'confidence': self.confidence
}
self.__dict__.update(input_dict if input_dict else {})

Expand Down Expand Up @@ -105,14 +119,14 @@ def validate(self):
valid = False
print(exc)

if not valid:
if not valid: #TODO log instead of print
print(
"Skipping segment due to validation error. \nPlease note that this invalidates WER calculations based on the entire file.\nSegment: ",
json.dumps(self.__dict__)
)

if "-" in self.audiofile:
self.audiofile = self.audiofile.replace("-", "_")
if "-" in self.filename:
self.filename = self.filename.replace("-", "_")
print("Please rename audio file to replace hyphens with underscores")

return valid
Expand Down

0 comments on commit 9e1c238

Please sign in to comment.