Merge pull request #315 from pbs/OCTO-10874-ascii-duplicates

OCTO-10874-ascii-duplicates
pbs · Dec 12, 2023 · a37b8a4 · a37b8a4
2 parents 4fe2335 + 0a97c3f
commit a37b8a4
Show file tree

Hide file tree

Showing 4 changed files with 86 additions and 9 deletions.
diff --git a/pycaption/scc/constants.py b/pycaption/scc/constants.py
@@ -1,4 +1,5 @@
 from itertools import product
+from collections import defaultdict
 
 COMMANDS = {
     '9420': '',
@@ -985,3 +986,64 @@ def _restructure_bytes_to_position_map(byte_to_pos_map):
 
 
 HEADER = 'Scenarist_SCC V1.0'
+
+# taken from
+# http://www.theneitherworld.com/mcpoodle/SCC_TOOLS/DOCS/CC_CHARS.HTML
+INCONVERTIBLE_TO_ASCII_EXTENDED_CHARS_ASSOCIATION = {  # extended -> ASCII
+    '¡': "!",  # inverted exclamation mark
+    '¤': "C",  # currency
+    '¥': "Y",  # yen
+    '¦': "-",  # broken bar
+    '©': "c",  # copyright sign
+    '«': '"',  # left pointing double angle quotation mark
+    '»': '"',  # right pointing double angle quotation mark
+    'À': "A",
+    'Á': "A",
+    'Â': "A",
+    'Ã': "A",
+    'Ä': "A",
+    'Å': "A",
+    'Ç': "C",
+    'È': "E",
+    'É': "E",
+    'Ê': "E",
+    'Ë': "E",
+    'Ì': "I",
+    'Í': "I",
+    'Î': "I",
+    'Ï': "I",
+    'Ò': "O",
+    'Ó': "O",
+    'Ô': "O",
+    'Õ': "O",
+    'Ö': "O",
+    'Ø': "O",
+    'Ù': "U",
+    'Ú': "U",
+    'Û': "U",
+    'Ü': "U",
+    'ß': "s",  # sharp s
+    'ã': "a",
+    'ä': "a",
+    'å': "a",
+    'ë': "e",
+    'ì': "i",
+    'ï': "i",
+    'ò': "o",
+    'õ': "o",
+    'ö': "o",
+    'ø': "o",
+    'ù': "u",
+    'ü': "u",
+    '—': "-",  # em dash
+    '‘': "'",  # left single quotation mark
+    '’': "'",  # right single quotation mark
+    '“': '"',  # left double quotation mark
+    '”': '"',  # right double quotation mark
+    '•': ".",  # bullet
+    '℠': "s",  # service mark
+    '┌': "+",  # box drawing: upper left corner
+    '┐': "+",  # box drawing: upper right corner
+    '└': "+",  # box drawing: lower left corner
+    '┘': "+",  # box drawing: lower right corner
+}
diff --git a/pycaption/scc/specialized_collections.py b/pycaption/scc/specialized_collections.py
@@ -8,7 +8,7 @@
 )
 from .constants import (
     PAC_BYTES_TO_POSITIONING_MAP, COMMANDS, PAC_TAB_OFFSET_COMMANDS,
-    MICROSECONDS_PER_CODEWORD,
+    MICROSECONDS_PER_CODEWORD, INCONVERTIBLE_TO_ASCII_EXTENDED_CHARS_ASSOCIATION
 )
 
 PopOnCue = collections.namedtuple("PopOnCue", "buffer, start, end")
@@ -423,14 +423,25 @@ def remove_ascii_duplicate(self, accented_character):
 
         :type accented_character: str
         """
-        if self._collection and self._collection[-1].is_text_node() and \
-                self._collection[-1].text:
-            ascii_char = unicodedata.normalize('NFD', accented_character)\
-                .encode('ascii', 'ignore').decode("utf-8")
+        is_text_node = (
+                self._collection and
+                self._collection[-1].is_text_node() and
+                self._collection[-1].text
+                )
+        if is_text_node:
+            try:
+                ascii_char = unicodedata.normalize('NFD', accented_character) \
+                    .encode('ascii', 'strict').decode("utf-8")
+            except (UnicodeEncodeError, UnicodeDecodeError):
+                ascii_char = INCONVERTIBLE_TO_ASCII_EXTENDED_CHARS_ASSOCIATION[
+                    accented_character
+                ]
+
             if ascii_char and self._collection[-1].text[-1] == ascii_char:
                 self._collection[-1].text = self._collection[-1].text[:-1]
 
 
+
 def _get_layout_from_tuple(position_tuple):
     """Create a Layout object from the positioning information given
 

diff --git a/tests/fixtures/scc.py b/tests/fixtures/scc.py
@@ -314,6 +314,9 @@ def sample_scc_with_extended_characters():
 Scenarist_SCC V1.0
 
 00:04:36;06	9420 942c 942f 9420 91d6 cdc1 13b0 5254 c8c1 ba80 942f
+00:22:32:18	9420 942c 942f 9420 9454 97a1 4ad5 ce49 4f52 ba20 a180 92a7 d975 6da1 9470 9723 d961 206d e520 73e9
+
+00:22:34:28	942c e56e f4ef 206d 75e3 68ef 206d e5ea eff2 ae80 9420 942c 942f 9420 94f2 9723 4ad5 ce49 4f52 ba20 4f79 e52c 20c1 ec6d 612c
 """
 
 

diff --git a/tests/test_scc.py b/tests/test_scc.py
@@ -193,9 +193,11 @@ def test_timing_is_properly_set_on_split_captions(
     def test_skip_extended_characters_ascii_duplicate(
             self, sample_scc_with_extended_characters):
         caption_set = SCCReader().read(sample_scc_with_extended_characters)
-        nodes = caption_set.get_captions('en-US')[0].nodes
-
-        assert nodes[0].content == 'MÄRTHA:'
+        captions = caption_set.get_captions('en-US')
+        assert captions[0].nodes[0].content == 'MÄRTHA:'
+        expected_result = ['JUNIOR: ¡Yum!', None, 'Ya me siento mucho mejor.']
+        content = [node.content for node in captions[1].nodes]
+        assert all(result in expected_result for result in content)
 
     def test_skip_duplicate_tab_offset(self, sample_scc_duplicate_tab_offset):
         expected_lines = [
@@ -272,7 +274,6 @@ def test_freeze_rollup_captions_contents(self, sample_scc_roll_up_ru2):
             'And wildlife.',
             '>> Bike Iowa, your source for',
         ]
-
         assert expected_texts == actual_texts
 
     def test_multiple_formats(self, sample_scc_multiple_formats):