Merge pull request #26 from finos-voice/ageojo/segment

segments.py
finos · Jan 3, 2019 · 9e1c238 · 9e1c238
2 parents 160a2c5 + af1df17
commit 9e1c238
Show file tree

Hide file tree

Showing 7 changed files with 80 additions and 26 deletions.
diff --git a/README.md b/README.md
@@ -25,6 +25,13 @@ optional arguments:
 ```
 This tool allows for easy conversion among file formats listed above.
 
+Note: Attributes of a segment object not present in a parsed file retain their default values
+
+- For example, a `segment` object is created for each line of an STM file
+- each is initialized with the following default values which are not encoded in STM files: `formatted_text=''`;  `confidence=1.0` 
+
+
+
 ### wer
 ```text
 usage: wer [-h] [--char-level] [--ignore-nsns]

diff --git a/asrtoolkit/data_handlers/html.py b/asrtoolkit/data_handlers/html.py
@@ -41,7 +41,7 @@ def footer():
 def format_segment(seg):
   """
     Formats a segment assuming it's an instance of class segment with elements
-    audiofile, channel, speaker, start and stop times, label, and text
+    filename, channel, speaker, start and stop times, label, and text
   """
 
   return "<tr>" + "".join(

diff --git a/asrtoolkit/data_handlers/json.py b/asrtoolkit/data_handlers/json.py
@@ -24,7 +24,16 @@ def footer():
 def format_segment(seg):
   """
     Formats a segment assuming it's an instance of class segment with elements
-    audiofile, channel, speaker, start and stop times, label, and text
+    filename, channel, speaker, start and stop times, label, and text
+
+  :param: seg: segment object
+  :return: dict: key/val pairs contain 'segment'-level information
+    values of output segment-dict are values of corresponding input segment attributes
+
+      output_dict['startTimeSec'] = segment.start          (converse of parse_segment)
+
+    the segment dict structure generated here is the output of all microservices and the input to most of them;
+      each segment-dict contains a fragment of text (+ additional information)
   """
   output_dict = {}
   output_dict['speakerInfo'] = seg.speaker
@@ -33,22 +42,33 @@ def format_segment(seg):
   output_dict['genderInfo'] = {'gender': seg.label.split(",")[-1].replace(">", "")}
   output_dict['punctuated_transcript'] = seg.formatted_text
   output_dict['transcript'] = seg.text
+  output_dict['confidence'] = seg.confidence
 
   return json.dumps(output_dict, ensure_ascii=True)
 
 
 def parse_segment(input_seg):
   """
     Creates a segment object from an input GreenKey segment
-  """
+  :param: input_seg: dict  (segment-level dict: input_data['segments'][i]['segment'])
+      -> dict with keys 'channel', 'startTimeSec' etc mapping to segment object attributes
+
+  :return: segment object; attribute values are set to those of corresponding segment-dict keys
 
+      segment.start = segment_dict['startTimeSec']         (reverse mapping from format_segment)
+  """
   extracted_dict = {}
 
   def assign_if_present(value, dict_key=None, interior_key=None, proc_val=lambda val: val):
     """
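+    :param value: key to look up in input_seg
+    :param dict_key: key used in extracted_dict; defaults to `value` when None
+    :param interior_key: optional nested key looked up inside input_seg[value]
+    :param proc_val: callable presumably applied to the extracted value (identity by default) - confirm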
+    :return: type?
+
       Assigns value to extracted_dict object if present in input_seg
     """
-
     dict_key = value if dict_key is None else dict_key
 
     if value in input_seg and interior_key and interior_key in input_seg[value]:
@@ -68,20 +88,31 @@ def assign_if_present(value, dict_key=None, interior_key=None, proc_val=lambda v
     assign_if_present('punctuated_transcript', 'formatted_text')
     assign_if_present('speakerInfo', 'speaker', 'ID')
     assign_if_present('genderInfo', 'label', 'gender', lambda gender: "<o,f0,{:}>".format(gender))
+    assign_if_present('confidence', 'confidence')
 
     seg = segment(extracted_dict)
 
   except Exception as exc:
-    print(exc)
+    print(exc)      #TODO log instead
 
   return seg if seg and seg.validate() else None
 
 
 def read_in_memory(input_data):
   """
     Reads input json objects
+
+  :param: input_data: dict with key 'segments'
+    input_data['segments']: List[Dict]; each dict has key 'segment' with a dict as the value (segment_dict)
+    - segment_dicts contain key/val pairs that map to `segment` object attributes
+    - labels of mapped key-attribute pairs may differ: segment['startTimeSec'] -> segment.start
+
+  :return: list of segment objects
+    applies parse_segment function to each dict in input_data['segments']
+     - func creates a `segment` object for each segment_dict, mapping corresponding attributes
+
+  input_data['segments'][i]['segment'] --> mapped to the ith segment object (with 'start', 'stop', etc.)
   """
-  segments = []
   segments = [_ for _ in map(parse_segment, input_data['segments']) if _ is not None]
   return segments
 
@@ -90,9 +121,7 @@ def read_file(file_name):
   """
     Reads a JSON file, skipping any bad segments
   """
-  segments = []
   with open(file_name, encoding="utf-8") as f:
     input_json = json.load(f)
     segments = read_in_memory(input_json)
-
   return segments
diff --git a/asrtoolkit/data_handlers/srt.py b/asrtoolkit/data_handlers/srt.py
@@ -16,7 +16,7 @@
 def format_segment(seg):
   """
     Formats a segment assuming it's an instance of class segment with elements
-    audiofile, channel, speaker, start and stop times, label, and text
+    filename, channel, speaker, start and stop times, label, and text
   """
 
   ret_str = "1\n{:} --> {:}\n".format(seconds_to_timestamp(seg.start), seconds_to_timestamp(seg.stop)).replace(".", ",")

diff --git a/asrtoolkit/data_handlers/stm.py b/asrtoolkit/data_handlers/stm.py
@@ -16,33 +16,37 @@
 
 def format_segment(seg):
   """
+  :param seg: segment object
+  :return str: text for a particular STM line (see segment __str__ method)
     Formats a segment assuming it's an instance of class segment with elements
-    audiofile, channel, speaker, start and stop times, label, and text
+    filename, channel, speaker, start and stop times, label, and text
   """
   return " ".join(
-    [str(seg.__dict__[_]) for _ in ('audiofile', 'channel', 'speaker', 'start', 'stop', 'label')] +
-    [clean_up(seg.__dict__['text'])]
+    [str(seg.__dict__[_]) for _ in ('filename', 'channel', 'speaker', 'start', 'stop', 'label')] +
+    [clean_up(seg.__dict__['text'])]  # clean_up used to unformat stm file text
   )
 
 
 def parse_line(line):
-  " parse a single line of an stm file"
-
+  """
+  :param line: str; a single line of an stm file
+  :return: segment object if STM file line contains accurately formatted data; else None
+  """
   data = line.strip().split()
 
   seg = None
   if len(data) > 6:
-    audiofile, channel, speaker, start, stop, label = data[:6]
+    filename, channel, speaker, start, stop, label = data[:6]
     text = " ".join(data[6:])
     seg = segment(
       {
-        'audiofile': audiofile,
+        'filename': filename,
         'channel': channel,
         'speaker': speaker,
         'start': start,
         'stop': stop,
         'label': label,
-        'text': text
+        'text': text,
       }
     )
   return seg if seg and seg.validate() else None
@@ -51,12 +55,12 @@ def parse_line(line):
 def read_file(file_name):
   """
     Reads an STM file, skipping any gap lines
+    :return: list of segment objects
   """
   segments = []
   with open(file_name, encoding="utf-8") as f:
     for line in f:
       seg = parse_line(line)
       if seg is not None:
         segments.append(seg)
-
   return segments
diff --git a/asrtoolkit/data_handlers/vtt.py b/asrtoolkit/data_handlers/vtt.py
@@ -21,7 +21,7 @@ def header():
 def format_segment(seg):
   """
     Formats a segment assuming it's an instance of class segment with elements
-    audiofile, channel, speaker, start and stop times, label, and text
+    filename, channel, speaker, start and stop times, label, and text
   """
 
   ret_str = "{:} --> {:}".format(seconds_to_timestamp(seg.start), seconds_to_timestamp(seg.stop))

diff --git a/asrtoolkit/data_structures/segment.py b/asrtoolkit/data_structures/segment.py
@@ -40,10 +40,15 @@ def clean_float(input_float):
 class segment(object):
   """
   Class for holding segment-specific information
+    A segment object corresponds to the dict under the key 'segment'
+    in the ASR generated transcript (lattice)
+    - the fields included below are shared across 'segments',
+    but 'segments' may contain many other fields (e.g. sentiment) depending on
+    the text processing pipeline selected.
   """
 
   # refer to some file if possible
-  audiofile = "unknown"
+  filename = "unknown"
   # by default, use channel 1
   channel = "1"
   # need a speaker id
@@ -58,23 +63,32 @@ class segment(object):
   text = ""
   # text for printing out to fancy output formats
   formatted_text = ""
+  # confidence in accuracy of text
+  confidence = 1.0
 
   def __init__(self, input_dict=None):
     """
-    Stores and initializes audiofile, channel, speaker,  start & stop times, label, and text
+    Stores and initializes filename, channel, speaker, start & stop times, label,
+    and formatted and unformatted text fields.
+    - Unmodified ASR transcripts are unformatted text.
+    - Raw Chat data is formatted text;
+      `clean_up` from asrtoolkit.clean_formatting is used to convert it to unformatted text
+    Note: `channel` (as currently defined) applies only to audio input
+      - all chat data will retain default value of '1'
 
     >>> seg = segment({"text":"this is a test"})
 
     """
     self.__dict__ = {
-      'audiofile': self.audiofile,
+      'filename': self.filename,
       'channel': self.channel,
       'speaker': self.speaker,
       'start': self.start,
       'stop': self.stop,
       'label': self.label,
       'text': self.text,
-      'formatted_text': self.formatted_text
+      'formatted_text': self.formatted_text,
+      'confidence': self.confidence
     }
     self.__dict__.update(input_dict if input_dict else {})
 
@@ -105,14 +119,14 @@ def validate(self):
       valid = False
       print(exc)
 
-    if not valid:
+    if not valid:   #TODO log instead of print
       print(
         "Skipping segment due to validation error. \nPlease note that this invalidates WER calculations based on the entire file.\nSegment: ",
         json.dumps(self.__dict__)
       )
 
-    if "-" in self.audiofile:
-      self.audiofile = self.audiofile.replace("-", "_")
+    if "-" in self.filename:
+      self.filename = self.filename.replace("-", "_")
       print("Please rename audio file to replace hyphens with underscores")
 
     return valid