Skip to content

Commit

Permalink
Merge pull request #315 from pbs/OCTO-10874-ascii-duplicates
Browse files Browse the repository at this point in the history
OCTO-10874-ascii-duplicates
  • Loading branch information
OlteanuRares authored Dec 12, 2023
2 parents 4fe2335 + 0a97c3f commit a37b8a4
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 9 deletions.
62 changes: 62 additions & 0 deletions pycaption/scc/constants.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from itertools import product
from collections import defaultdict

COMMANDS = {
'9420': '',
Expand Down Expand Up @@ -985,3 +986,64 @@ def _restructure_bytes_to_position_map(byte_to_pos_map):


HEADER = 'Scenarist_SCC V1.0'

# ASCII stand-ins for extended characters that cannot be reduced to a single
# ASCII character by Unicode normalization + strict ASCII encoding.
# Each key is an extended character; each value is the one-character ASCII
# substitute associated with it, used to detect (and drop) the ASCII
# duplicate that precedes the extended character in the caption text.
#
# taken from
# http://www.theneitherworld.com/mcpoodle/SCC_TOOLS/DOCS/CC_CHARS.HTML
INCONVERTIBLE_TO_ASCII_EXTENDED_CHARS_ASSOCIATION = {
    '¡': "!",  # inverted exclamation mark
    '¤': "C",  # currency
    '¥': "Y",  # yen
    '¦': "-",  # broken bar
    '©': "c",  # copyright sign
    '«': '"',  # left pointing double angle quotation mark
    '»': '"',  # right pointing double angle quotation mark
    'À': "A",
    'Á': "A",
    'Â': "A",
    'Ã': "A",
    'Ä': "A",
    'Å': "A",
    'Ç': "C",
    'È': "E",
    'É': "E",
    'Ê': "E",
    'Ë': "E",
    'Ì': "I",
    'Í': "I",
    'Î': "I",
    'Ï': "I",
    'Ò': "O",
    'Ó': "O",
    'Ô': "O",  # was ")" -- a typo; all other O variants map to "O"
    'Õ': "O",
    'Ö': "O",
    'Ø': "O",
    'Ù': "U",
    'Ú': "U",
    'Û': "U",
    'Ü': "U",
    'ß': "s",
    'ã': "a",
    'ä': "a",
    'å': "a",
    'ë': "e",
    'ì': "i",
    'ï': "i",
    'ò': "o",
    'õ': "o",
    'ö': "o",
    'ø': "o",
    'ù': "u",
    'ü': "u",
    '—': "-",  # em dash
    '‘': "'",
    '’': "'",
    '“': '"',
    '”': '"',
    '•': ".",
    '℠': "s",
    '┌': "+",
    '┐': "+",
    '└': "+",
    '┘': "+",
}
21 changes: 16 additions & 5 deletions pycaption/scc/specialized_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
)
from .constants import (
PAC_BYTES_TO_POSITIONING_MAP, COMMANDS, PAC_TAB_OFFSET_COMMANDS,
MICROSECONDS_PER_CODEWORD,
MICROSECONDS_PER_CODEWORD, INCONVERTIBLE_TO_ASCII_EXTENDED_CHARS_ASSOCIATION
)

PopOnCue = collections.namedtuple("PopOnCue", "buffer, start, end")
Expand Down Expand Up @@ -423,14 +423,25 @@ def remove_ascii_duplicate(self, accented_character):
:type accented_character: str
"""
if self._collection and self._collection[-1].is_text_node() and \
self._collection[-1].text:
ascii_char = unicodedata.normalize('NFD', accented_character)\
.encode('ascii', 'ignore').decode("utf-8")
is_text_node = (
self._collection and
self._collection[-1].is_text_node() and
self._collection[-1].text
)
if is_text_node:
try:
ascii_char = unicodedata.normalize('NFD', accented_character) \
.encode('ascii', 'strict').decode("utf-8")
except (UnicodeEncodeError, UnicodeDecodeError):
ascii_char = INCONVERTIBLE_TO_ASCII_EXTENDED_CHARS_ASSOCIATION[
accented_character
]

if ascii_char and self._collection[-1].text[-1] == ascii_char:
self._collection[-1].text = self._collection[-1].text[:-1]



def _get_layout_from_tuple(position_tuple):
"""Create a Layout object from the positioning information given
Expand Down
3 changes: 3 additions & 0 deletions tests/fixtures/scc.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,9 @@ def sample_scc_with_extended_characters():
Scenarist_SCC V1.0
00:04:36;06 9420 942c 942f 9420 91d6 cdc1 13b0 5254 c8c1 ba80 942f
00:22:32:18 9420 942c 942f 9420 9454 97a1 4ad5 ce49 4f52 ba20 a180 92a7 d975 6da1 9470 9723 d961 206d e520 73e9
00:22:34:28 942c e56e f4ef 206d 75e3 68ef 206d e5ea eff2 ae80 9420 942c 942f 9420 94f2 9723 4ad5 ce49 4f52 ba20 4f79 e52c 20c1 ec6d 612c
"""


Expand Down
9 changes: 5 additions & 4 deletions tests/test_scc.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,9 +193,11 @@ def test_timing_is_properly_set_on_split_captions(
def test_skip_extended_characters_ascii_duplicate(
self, sample_scc_with_extended_characters):
caption_set = SCCReader().read(sample_scc_with_extended_characters)
nodes = caption_set.get_captions('en-US')[0].nodes

assert nodes[0].content == 'MÄRTHA:'
captions = caption_set.get_captions('en-US')
assert captions[0].nodes[0].content == 'MÄRTHA:'
expected_result = ['JUNIOR: ¡Yum!', None, 'Ya me siento mucho mejor.']
content = [node.content for node in captions[1].nodes]
assert all(result in expected_result for result in content)

def test_skip_duplicate_tab_offset(self, sample_scc_duplicate_tab_offset):
expected_lines = [
Expand Down Expand Up @@ -272,7 +274,6 @@ def test_freeze_rollup_captions_contents(self, sample_scc_roll_up_ru2):
'And wildlife.',
'>> Bike Iowa, your source for',
]

assert expected_texts == actual_texts

def test_multiple_formats(self, sample_scc_multiple_formats):
Expand Down

0 comments on commit a37b8a4

Please sign in to comment.