diff --git a/ebl/atf_importer/domain/atf_preprocessor.py b/ebl/atf_importer/domain/atf_preprocessor.py index 4815c3172..93b29e24a 100644 --- a/ebl/atf_importer/domain/atf_preprocessor.py +++ b/ebl/atf_importer/domain/atf_preprocessor.py @@ -75,9 +75,8 @@ def transform_legacy_atf(self, tree: Tree) -> Tree: visitor = LegacyAtfVisitor() visitor.visit(tree) print('!!!! visitor.legacy_found', visitor.legacy_found) - if visitor.legacy_found==True: + if visitor.legacy_found: self.logger.info("Legacy line successfully parsed") - print(tree.pretty) return tree def parse_and_convert_line( diff --git a/ebl/atf_importer/domain/legacy_atf_visitor.py b/ebl/atf_importer/domain/legacy_atf_visitor.py index 17152320a..cdef34e91 100644 --- a/ebl/atf_importer/domain/legacy_atf_visitor.py +++ b/ebl/atf_importer/domain/legacy_atf_visitor.py @@ -1,6 +1,7 @@ import re from typing import Optional, Sequence, Callable from lark.visitors import Visitor, Transformer, Tree, Token, v_args +from ebl.transliteration.domain.atf import _SUB_SCRIPT # ToDo: Continue from here # Make sure every transformer is implemented and works properly. @@ -16,6 +17,9 @@ def __init__(self): super().__init__() self.legacy_found = False + def clear(self): + self.legacy_found = False + class HalfBracketsTransformer(LegacyTransformer): # ToDo: Check if works @@ -23,6 +27,10 @@ class HalfBracketsTransformer(LegacyTransformer): def __init__(self): self.open = False + def clear(self): + self.legacy_found = False + self.open = False + @v_args(inline=True) def ebl_atf_text_line__LEGACY_OPEN_HALF_BRACKET(self, bracket: str) -> str: print("! bbbbbb", bracket) @@ -84,72 +92,107 @@ class AccentedIndexTransformer(LegacyTransformer): "Ì": "I", "Ù": "U", } - patterns = ((re.compile("[áéíúÁÉÍÚ]"), "₂"), (re.compile("[àèìùÀÈÌÙ]"), "₃")) + accented_index_patterns = ( + (re.compile("[áéíúÁÉÍÚ]"), "₂"), + (re.compile("[àèìùÀÈÌÙ]"), "₃"), + ) def __init__(self, **kwargs): super().__init__(**kwargs) self.sub_index = None + def clear(self): + self.legacy_found = False + self.sub_index = None + @v_args(inline=True) def ebl_atf_text_line__VALUE_CHARACTER(self, char: str) -> str: if char in self.replacement_chars.keys(): - print("!!!!!!!!!!!!!!!!!!!! LEGACY_VALUE_CHARACTER", char) return self._transform_accented_vowel(char) return char @v_args(inline=True) def ebl_atf_text_line__LOGOGRAM_CHARACTER(self, char: str) -> str: if char in self.replacement_chars.keys(): - print("!!!!!!!!!!!!!!!!!!!! LEGACY_LOGOGRAM_CHARACTER", char) return self._transform_accented_vowel(char) return char @v_args(inline=True) - def ebl_atf_text_line__sub_index(self, char: Optional[str]) -> Optional[str]: - print("!!!!!!!!!!!!!!!!!!!! ebl_atf_text_line__sub_index") - return self.sub_index if self.sub_index and not char else char + def ebl_atf_text_line__sub_index(self, sub_index: Optional[str]) -> Optional[str]: + if sub_index and sub_index[0] in _SUB_SCRIPT.keys(): + self.legacy_found = True + self._set_sub_index("".join(_SUB_SCRIPT[digit] for digit in sub_index)) + return self.sub_index if self.sub_index else sub_index def _transform_accented_vowel(self, char: str) -> str: - self._set_sub_index(char) + self._set_sub_index_from_accented(char) self.legacy_found = True return self.replacement_chars[char] - def _set_sub_index(self, char: str) -> None: - for pattern, suffix in self.patterns: + def _set_sub_index_from_accented(self, char: str) -> None: + for pattern, sub_index in self.accented_index_patterns: if pattern.search(char): - self.sub_index = suffix + self._set_sub_index(sub_index) break + def _set_sub_index(self, sub_index: str) -> None: + self.sub_index = Tree( + "ebl_atf_text_line__sub_index", + [Token("ebl_atf_text_line__SUB_INDEX", sub_index)], + ) + -accented_index_transformer = AccentedIndexTransformer() -half_brackets_transformer = HalfBracketsTransformer() -oracc_joiner_transformer = OraccJoinerTransformer() -oracc_special_transformer = OraccSpecialTransformer() +index_and_accented_transformer = (AccentedIndexTransformer(), "all_children") +half_brackets_transformer = (HalfBracketsTransformer(), "first_child") +oracc_joiner_transformer = (OraccJoinerTransformer(), "first_child") +oracc_special_transformer = (OraccSpecialTransformer(), "first_child") class LegacyAtfVisitor(Visitor): - # ToDo: Continue from here. - # Move all atf preprocessing here - # ?Try to convert to string and then parse? text_line_prefix = "ebl_atf_text_line" tokens_to_visit = { - "number": [accented_index_transformer], - "reading": [accented_index_transformer], - "logogram": [accented_index_transformer, oracc_special_transformer], - "surrogate": [accented_index_transformer], - "GRAPHEME_NAME": [accented_index_transformer], + "number": [index_and_accented_transformer], + "reading": [index_and_accented_transformer], + "logogram": [index_and_accented_transformer, oracc_special_transformer], + "surrogate": [index_and_accented_transformer], + "grapheme": [index_and_accented_transformer], "_parts_pattern": [half_brackets_transformer], "_parts_pattern_gloss": [half_brackets_transformer], "LEGACY_ORACC_JOINER": [oracc_joiner_transformer], } + # ToDo: Fix nested `sign_index` within sign, as in `reading`. + """ + ebl_atf_text_line__word + ebl_atf_text_line__surrogate <-- ! Main parent + ebl_atf_text_line__logogram_name + ebl_atf_text_line__logogram_name_part + Š + U + ebl_atf_text_line__sub_index ₂ <-- ! The expected subindex + ebl_atf_text_line__modifiers + ebl_atf_text_line__flags + ebl_atf_text_line__surrogate_text + ebl_atf_text_line__reading + ebl_atf_text_line__value_name + ebl_atf_text_line__value_name_part + š + u + m + m + a + ebl_atf_text_line__sub_index ₂ <-- ! Problem here. Deeply nested second `sub_index` + ebl_atf_text_line__modifiers + ebl_atf_text_line__flags + None + """ + def __init__(self): super().__init__() self.legacy_found = False for suffix, transformers in self.tokens_to_visit.items(): self._set_rules(suffix, transformers) - - input("legacy visitor initiated") + input("LegacyAtfVisitor initiated") def _set_rules( self, @@ -161,28 +204,24 @@ def _set_rules( setattr( self, f"{prefix}__{suffix}", - self._wrap_legacy_found(transformers), + self._wrap_transformers(transformers), ) - def _wrap_legacy_found( + def _wrap_transformers( self, transformers: Sequence[LegacyTransformer] ) -> Callable[[Tree], None]: def _method(tree: Tree) -> Tree: - for transformer in transformers: - # ToDo: Continue from here. Top Priority. - # There is an error that likely has to do with - # the token (`tree`) element being added children - # disregarding the internal structure. - # A possible approach for complex transformers (such as `AccentedIndexTransformer`) - # might be saving the element as an attibute - # of the `LegacyTransformer` class, then extracting it, e.g.: - # transformer.transform(tree) - # tree.children[0] = transformer.result - # Make sure, however, that old results are not memorized: - # Either initiate new instances or (better?) renew them on each run. - tree.children[0] = transformer.transform(tree) - if transformer.legacy_found: - self.legacy_found = True - print("\nTransformed Tree:", tree.pretty()) + for transformer, replace in transformers: + self._transform(tree, transformer, replace) return _method + + def _transform(self, tree: Tree, transformer: LegacyTransformer, replace: str): + transformer.clear() + transformed = transformer.transform(tree) + if transformer.legacy_found: + self.legacy_found = True + if replace == "first_child": + tree.children[0] = transformed.children[0] + elif replace == "all_children": + tree.children = transformed.children diff --git a/ebl/tests/atf_importer/test_atf_preprocessor.py b/ebl/tests/atf_importer/test_atf_preprocessor.py index 10c41e8e2..97ec324e5 100644 --- a/ebl/tests/atf_importer/test_atf_preprocessor.py +++ b/ebl/tests/atf_importer/test_atf_preprocessor.py @@ -4,12 +4,16 @@ PROBLEMATIC_TEXT_LINES = [ ( - "1. ŠÚ", - "1. ŠU₂", + "1. ŠÚ ù ŠÚ<(šumma)> |ŠÚ+ŠÚ|", + "1. ŠU₂ u₃ ŠU₂<(šumma)> |ŠU₂+ŠU₂|", ), ( - "1. [*]", - "1. [DIŠ]", + "1. [*] * *-*", + "1. [DIŠ] DIŠ DIŠ-DIŠ", + ), + ( + "1. ŠU2 u3 ŠU2<(šumma)> |ŠU2+ŠU2|", + "1. ŠU₂ u₃ ŠU₂<(šumma)> |ŠU₂+ŠU₂|", ), ( "1. [*] AN#.GE₆ GAR-ma U₄ ŠÚ{+up} * AN.GE₆ GAR-ma {d}IŠKUR KA-šú ŠUB{" @@ -58,11 +62,9 @@ def test_text_lines(legacy_line, ebl_line): legacy_tree = atf_preprocessor.transform_legacy_atf(legacy_tree) expected_tree = atf_preprocessor.ebl_parser.parse(ebl_line) - expected_tree = atf_preprocessor.transform_legacy_atf(expected_tree) - # (converted_line,) = atf_preprocessor.process_line(legacy_line) - # print(legacy_tree) - # print(expected_tree) + print('RESULT:\n', legacy_tree.pretty()) + print('EXPECTED:\n', expected_tree.pretty()) assert legacy_tree == expected_tree diff --git a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_common.lark b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_common.lark index 7bc3f135c..1fec27864 100644 --- a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_common.lark +++ b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_common.lark @@ -32,4 +32,17 @@ LOGOGRAM_CHARACTER_MAIN: "A" | "Ā" | "Â" | "B" | "D" | "E" | "W" | "Z" | "Ḫ" | "ʾ" VALUE_CHARACTER: VALUE_CHARACTER_MAIN | LEGACY_VALUE_CHARACTER_ACCENTED -LOGOGRAM_CHARACTER: LOGOGRAM_CHARACTER_MAIN | LEGACY_LOGOGRAM_CHARACTER_ACCENTED \ No newline at end of file +LOGOGRAM_CHARACTER: LOGOGRAM_CHARACTER_MAIN | LEGACY_LOGOGRAM_CHARACTER_ACCENTED + +GREEK_ALPHABET: "Α" | "α" | "Β" | "β" | "Γ" | "γ" | "Δ" | "δ" | "Ε" | "ε" + | "Ζ" | "ζ" | "Η" | "η" | "Θ" | "θ" | "Ι" | "ι" | "Κ" | "κ" + | "Λ" | "λ" | "Μ" | "μ" | "Ν" | "ν" | "Ξ" | "ξ" | "Ο" | "ο" + | "Π" | "π" | "Ρ" | "ρ" | "Σ" | "σ" | "ς" | "Τ" | "τ" | "Υ" + | "υ" | "Φ" | "φ" | "Χ" | "χ" | "Ψ" | "ψ" | "Ω" | "ω" + +AKKADIAN_ALPHABET: "ʾ" | "A" | "B" | "D" | "E" | "G" | "H" | "I" | "K" | "L" + | "M" | "N" | "P" | "S" | "T" | "U" | "Y" | "Z" | "a" | "b" + | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "k" | "l" | "m" + | "n" | "p" | "q" | "r" | "s" | "t" | "u" | "w" | "y" | "z" + | "É" | "â" | "ê" | "î" | "û" | "ā" | "Ē" | "ē" | "ī" | "Š" + | "š" | "Ś" | "ś" | "ū" | "ṣ" | "ṭ" | "₄" \ No newline at end of file diff --git a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_text_line.lark b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_text_line.lark index 438dd7288..fae62bf3f 100644 --- a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_text_line.lark +++ b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_text_line.lark @@ -6,6 +6,7 @@ %import .ebl_atf_common (surface, SURFACE, generic_surface, face, edge) %import .ebl_atf_common (seal) %import .ebl_atf_common (VALUE_CHARACTER, LOGOGRAM_CHARACTER) +%import .ebl_atf_common (AKKADIAN_ALPHABET, GREEK_ALPHABET) %import .ebl_atf_abbreviations (PERIOD) %import .legacy_atf (LEGACY_OPEN_HALF_BRACKET, LEGACY_CLOSE_HALF_BRACKET) %import .legacy_atf (LEGACY_ORACC_JOINER, LEGACY_ORACC_DISH_DIVIDER) @@ -222,7 +223,6 @@ reading: value_name sub_index modifiers flags ["(" any_grapheme ")"] logogram: logogram_name sub_index modifiers flags ["(" any_grapheme ")"] surrogate: logogram_name sub_index modifiers flags "<(" surrogate_text ")>" surrogate_text: reading (joiner reading)* -sub_index: [SUB_INDEX] number_name: number_name_head (broken_away number_name_part)* value_name: value_name_part (broken_away value_name_part)* @@ -233,20 +233,20 @@ value_name_part: VALUE_CHARACTER+ logogram_name_part: LOGOGRAM_CHARACTER+ ?any_grapheme: compound_grapheme | grapheme -compound_grapheme: "|" COMPOUND_FRAGMENT ("." COMPOUND_FRAGMENT)* "|" -COMPOUND_FRAGMENT: SUB_COMPOUND (COMPOUND_OPERATOR SUB_COMPOUND)* -SUB_COMPOUND: "(" COMPOUND_PART ((COMPOUND_OPERATOR) COMPOUND_PART)* ")" - | COMPOUND_PART -COMPOUND_PART: GRAPHEME (VARIANT_SEPARATOR GRAPHEME)* +compound_grapheme: "|" compound_fragment ("." compound_fragment)* "|" +?compound_fragment: sub_compound (COMPOUND_OPERATOR sub_compound)* +?sub_compound: "(" compound_part ((COMPOUND_OPERATOR) compound_part)* ")" + | compound_part +?compound_part: grapheme (VARIANT_SEPARATOR grapheme)* COMPOUND_OPERATOR: "×" | "%" | "&" | "+" | "." -grapheme: GRAPHEME_NAME modifiers flags -GRAPHEME: GRAPHEME_NAME MODIFIER* FLAG -GRAPHEME_NAME: GRAPHEME_CHARACTER (GRAPHEME_CHARACTER)* SUB_INDEX? -GRAPHEME_CHARACTER: VALUE_CHARACTER | LOGOGRAM_CHARACTER | "0".."9" +grapheme: grapheme_name sub_index modifiers flags +grapheme_name: grapheme_name_part +grapheme_name_part: VALUE_CHARACTER+ | LOGOGRAM_CHARACTER+ +sub_index: [SUB_INDEX] SUB_INDEX: NUMERIC_SUB_INDEX | "ₓ" -NUMERIC_SUB_INDEX: "₀" | "₁".."₉" ("₀".."₉")* +NUMERIC_SUB_INDEX: "₀" | "₁".."₉" ("₀".."₉")* | "1".."9" ("0".."9")* unidentified_sign: "X" flags unclear_sign: "x" flags @@ -310,12 +310,6 @@ open_emendation: "<" close_emendation: ">" akkadian_string: AKKADIAN_ALPHABET+ -AKKADIAN_ALPHABET: "ʾ" | "A" | "B" | "D" | "E" | "G" | "H" | "I" | "K" | "L" - | "M" | "N" | "P" | "S" | "T" | "U" | "Y" | "Z" | "a" | "b" - | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "k" | "l" | "m" - | "n" | "p" | "q" | "r" | "s" | "t" | "u" | "w" | "y" | "z" - | "É" | "â" | "ê" | "î" | "û" | "ā" | "Ē" | "ē" | "ī" | "Š" - | "š" | "Ś" | "ś" | "ū" | "ṣ" | "ṭ" | "₄" _greek: greek_token (_WORD_SEPARATOR greek_token)* ?greek_token: greek_word @@ -327,12 +321,6 @@ greek_word: (greek_enclosure | greek_word_part)* greek_word_part (greek_enclosur | unknown_number_of_signs ?greek_enclosure: _any_open | _any_close greek_letter: GREEK_ALPHABET flags -GREEK_ALPHABET: "Α" | "α" | "Β" | "β" | "Γ" | "γ" | "Δ" | "δ" | "Ε" | "ε" - | "Ζ" | "ζ" | "Η" | "η" | "Θ" | "θ" | "Ι" | "ι" | "Κ" | "κ" - | "Λ" | "λ" | "Μ" | "μ" | "Ν" | "ν" | "Ξ" | "ξ" | "Ο" | "ο" - | "Π" | "π" | "Ρ" | "ρ" | "Σ" | "σ" | "ς" | "Τ" | "τ" | "Υ" - | "υ" | "Φ" | "φ" | "Χ" | "χ" | "Ψ" | "ψ" | "Ω" | "ω" - labels: surface_label " " column_label | surface_label