diff --git a/ebl/atf_importer/domain/atf_conversions.py b/ebl/atf_importer/domain/atf_conversions.py index 0604791b7..688efb6e9 100644 --- a/ebl/atf_importer/domain/atf_conversions.py +++ b/ebl/atf_importer/domain/atf_conversions.py @@ -80,7 +80,7 @@ def __init__(self): self.alter_lem_line_at = [] self.removal_open = False - def ebl_atf_text_line__word(self, tree): + def ebl_atf_text_line__word(self, tree: Tree) -> None: assert tree.data == "ebl_atf_text_line__word" word = "" @@ -91,7 +91,7 @@ def ebl_atf_text_line__word(self, tree): self.removal_open = False self.alter_lem_line_at.append(self.wordcounter) elif isinstance(child, lexer.Token): - word += child + word += child # ToDo: Check, fix type error else: word += DepthFirstSearch().visit_topdown(child, "") diff --git a/ebl/atf_importer/domain/atf_preprocessor.py b/ebl/atf_importer/domain/atf_preprocessor.py index 55e661112..158134dc8 100644 --- a/ebl/atf_importer/domain/atf_preprocessor.py +++ b/ebl/atf_importer/domain/atf_preprocessor.py @@ -5,6 +5,7 @@ from ebl.atf_importer.domain.atf_preprocessor_util import Util from ebl.atf_importer.domain.atf_conversions import GetWords from ebl.atf_importer.domain.legacy_atf_visitor import LegacyAtfVisitor +# from ebl.transliteration.domain.line_transformer import LineTransformer class AtfPreprocessor(AtfPreprocessorBase): @@ -52,13 +53,16 @@ def process_line( return self.parse_and_convert_line(atf_line) def check_original_line(self, atf: str) -> Tuple[str, List[Any], str, List[Any]]: - input(f"! check_original_line. [{atf}]") + print("! check_original_line.") if self.style == 2 and atf[0] == "#" and atf[1] == " ": atf = atf.replace("#", "#note:") atf = atf.replace("# note:", "#note:") - input("! before parse") + # input(f"! before parse:\n{atf}") tree = self.ebl_parser.parse(atf) - input("! before transform") + # print(tree.pretty()) + # input(f"! after parse:\n{self.line_tree_to_string(tree)}") + # input("! before transform") + # input("! 
after transform") tree = self.transform_legacy_atf(tree) self.logger.info("Line successfully parsed") self.logger.debug(f"Parsed line as {tree.data}") @@ -81,7 +85,9 @@ def parse_and_convert_line( try: tree = self.ebl_parser.parse(atf) if tree.data in self.unused_lines: - result = self.get_empty_conversion(tree) + # result = self.get_empty_conversion(tree) + # ToDo: Check original + return tree elif tree.data == "lem_line": result = self.convert_lem_line(atf, tree) elif tree.data == "text_line": diff --git a/ebl/atf_importer/domain/atf_preprocessor_base.py b/ebl/atf_importer/domain/atf_preprocessor_base.py index 2f3a6d042..3ac1ea1b5 100644 --- a/ebl/atf_importer/domain/atf_preprocessor_base.py +++ b/ebl/atf_importer/domain/atf_preprocessor_base.py @@ -110,12 +110,6 @@ def __init__(self, logdir: str, style: int) -> None: # Previously: "lark-oracc/oracc_atf.lark", # This should be eventually removed completely. - # self.oracc_parser = Lark.open( - # "../../transliteration/domain/atf_parsers/lark_parser/ebl_atf.lark", - # maybe_placeholders=True, - # rel_to=__file__, - # ) - self.logger = logging.getLogger("Atf-Preprocessor") self.logger.setLevel(logging.DEBUG) self.skip_next_lem_line = False @@ -169,13 +163,13 @@ def unused_line( return (None, None, tree.data, None) def convert_lem_line( - self, atf: str, tree + self, atf: str, tree: Tree ) -> Tuple[Optional[str], Optional[List[Any]], str, Optional[List[Any]]]: if self.skip_next_lem_line: self.logger.warning("Skipping lem line due to previous flag.") self.skip_next_lem_line = False return (None, None, "lem_line", None) - lemmas_and_guidewords_array = self.serizalize_lemmas_and_guidewords() + lemmas_and_guidewords_array = self.serizalize_lemmas_and_guidewords(tree) self.logger.debug( "Converted line as " + tree.data @@ -186,6 +180,7 @@ def convert_lem_line( return atf, lemmas_and_guidewords_array, tree.data, [] def line_tree_to_string(self, tree: Tree) -> str: + # ToDo: Remove line_serializer = LineSerializer() 
line_serializer.visit_topdown(tree) return line_serializer.line.strip(" ") @@ -240,11 +235,9 @@ def _replace_dashes(self, atf: str) -> str: return re.sub(r"–|--", "-", atf) def _normalize_patterns(self, atf: str) -> str: - callback_normalize = ( - lambda pat: pat.group(1) - + pat.group(2) - + self._normalize_numbers(pat.group(3)) - ) + def callback_normalize(pat): + return pat.group(1) + pat.group(2) + self._normalize_numbers(pat.group(3)) + return re.sub(r"(.*?)([a-zA-Z])(\d+)", callback_normalize, atf) def _replace_primed_digits(self, atf: str) -> str: @@ -283,6 +276,7 @@ def _handle_dollar_line(self, atf: str) -> str: def _process_bracketed_parts(self, atf: str) -> str: self.open_found = False split = re.split(r"([⌈⌉⸢⸣])", atf) + # ToDo: Remove: if len(split) > 1 and atf.startswith("9. ⸢4(BÁN)?⸣"): # ToDo: Continue from here. # Problem with `4(BÁN)#?`, which is not in lark grammar diff --git a/ebl/atf_importer/domain/legacy_atf_visitor.py b/ebl/atf_importer/domain/legacy_atf_visitor.py index 219fd15b9..e530d7e25 100644 --- a/ebl/atf_importer/domain/legacy_atf_visitor.py +++ b/ebl/atf_importer/domain/legacy_atf_visitor.py @@ -1,6 +1,6 @@ import re from typing import Optional, Sequence, Callable -from lark.visitors import Visitor, Transformer, Tree, v_args +from lark.visitors import Visitor, Transformer, Tree, Token, v_args # ToDo: Continue from here # Make sure every transformer is implemented and works properly.
@@ -25,33 +25,33 @@ class LegacyAtfVisitor(Visitor): sign_rules = ["number", "reading", "logogram", "surrogate", "GRAPHEME_NAME"] legacy_damage_rules = ["_parts_pattern", "_parts_pattern_gloss"] legacy_joiner_rulers = ["LEGACY_ORACC_JOINER"] - legacy_special_rulers = ["LEGACY_ORACC_DISH_SIGN"] + legacy_special_rulers = ["logogram"] def __init__(self): super().__init__() self.legacy_found = False - self._set_rules(self.sign_rules, self.transform_legacy_sign) + self._set_rules(self.sign_rules, self.get_legacy_sign_transformer) self._set_rules( self.legacy_damage_rules, - self.transform_legacy_damage, + self.get_legacy_damage_transformer, ) self._set_rules( self.legacy_joiner_rulers, - self.transform_legacy_joiner, + self.get_legacy_joiner_transformer, ) self._set_rules( - self.legacy_divider_rulers, - self.transform_legacy_special, + self.legacy_special_rulers, + self.get_legacy_special_transformer, ) input("legacy visitor initiated") def _set_rules( self, rules: Sequence[str], - method: Callable[[Tree], Transformer], - prefix: Optional[str], + method: Callable[[Tree], LegacyTransformer], + prefix: Optional[str] = None, ) -> None: - prefix = perfix if prefix else self.text_line_prefix + prefix = prefix if prefix else self.text_line_prefix for rule in rules: setattr( self, @@ -60,30 +60,39 @@ def _set_rules( ) def _wrap_legacy_found( - self, method: Callable[[Tree], Transformer] - ) -> Callable[[Tree], None]: - def _method(tree: Tree) -> None: + self, method: Callable[[Tree], LegacyTransformer] + ) -> Callable[[Tree], Tree]: + def _method(tree: Tree) -> Tree: + # ToDo: Continue from here. Highest priority. + # Transformations happen, but the original tree won't change. transformer = method(tree) + tree = transformer.transform(tree) if transformer.legacy_found: self.legacy_found = True + print("\n!!!! 
tree", tree) + return tree return _method - def transform_legacy_sign(self, tree: Tree) -> None: - return AccentedIndexTransformer(visit_tokens=True).transform(tree) + def get_legacy_sign_transformer(self, tree: Tree) -> LegacyTransformer: + return AccentedIndexTransformer() - def transform_legacy_damage(self, tree: Tree) -> None: - return HalfBracketsTransformer().transform(tree) + def get_legacy_damage_transformer(self, tree: Tree) -> LegacyTransformer: + return HalfBracketsTransformer() - def transform_legacy_joiner(self, tree: Tree) -> None: - return OraccJoinerTransformer().transform(tree) + def get_legacy_joiner_transformer(self, tree: Tree) -> LegacyTransformer: + return OraccJoinerTransformer() - def transform_legacy_special(self, tree: Tree) -> None: - return OraccSpecialTransformer().transform(tree) + def get_legacy_special_transformer(self, tree: Tree) -> LegacyTransformer: + return OraccSpecialTransformer() class HalfBracketsTransformer(LegacyTransformer): # ToDo: Check if works + + def __init__(self): + self.open = False + @v_args(inline=True) def ebl_atf_text_line__LEGACY_OPEN_HALF_BRACKET(self, bracket: str) -> str: print("! bbbbbb", bracket) @@ -110,15 +119,20 @@ def ebl_atf_text_line__flags(self, flags: str): class OraccJoinerTransformer(LegacyTransformer): @v_args(inline=True) def ebl_atf_text_line__LEGACY_ORACC_JOINER(self, bracket: str) -> str: + print("!!!!!!!!!!!!!!!!!!!! LEGACY_ORACC_JOINER") self.legacy_found = True return "-" class OraccSpecialTransformer(LegacyTransformer): @v_args(inline=True) - def ebl_atf_text_line__LEGACY_ORACC_DISH_DIVIDER(self, bracket: str) -> str: + def ebl_atf_text_line__LEGACY_ORACC_DISH_DIVIDER(self, child: str) -> Tree: + print("!!!!!!!!!!!!!!!!!!!! 
LEGACY_ORACC_DISH_DIVIDER") self.legacy_found = True - return "DIŠ" + return Tree( + "ebl_atf_text_line__logogram_name_part", + [Token("ebl_atf_text_line__LOGOGRAM_CHARACTER", char) for char in "DIŠ"], + ) class AccentedIndexTransformer(LegacyTransformer): @@ -148,14 +162,17 @@ def __init__(self, **kwargs): @v_args(inline=True) def ebl_atf_text_line__LEGACY_VALUE_CHARACTER_ACCENTED(self, char: str) -> str: + print("!!!!!!!!!!!!!!!!!!!! LEGACY_VALUE_CHARACTER_ACCENTED") return self._transform_accented_vowel(char) @v_args(inline=True) def ebl_atf_text_line__LEGACY_LOGOGRAM_CHARACTER_ACCENTED(self, char: str) -> str: + print("!!!!!!!!!!!!!!!!!!!! LEGACY_LOGOGRAM_CHARACTER_ACCENTED") return self._transform_accented_vowel(char) @v_args(inline=True) def ebl_atf_text_line__sub_index(self, char: Optional[str]) -> Optional[str]: + print("!!!!!!!!!!!!!!!!!!!! ebl_atf_text_line__sub_index") return self.sub_index if self.sub_index and not char else char def _transform_accented_vowel(self, char: str) -> str: diff --git a/ebl/signs/infrastructure/mongo_sign_repository.py b/ebl/signs/infrastructure/mongo_sign_repository.py index 01ea2d967..d2b326794 100644 --- a/ebl/signs/infrastructure/mongo_sign_repository.py +++ b/ebl/signs/infrastructure/mongo_sign_repository.py @@ -16,9 +16,8 @@ Fossey, SortKeys, ) - from ebl.transliteration.application.museum_number_schema import MuseumNumberSchema -from ebl.transliteration.domain.lark_parser import parse_atf_lark +from ebl.transliteration.domain.atf_parsers.lark_parser import parse_atf_lark COLLECTION = "signs" diff --git a/ebl/tests/atf_importer/test_atf_preprocessor.py b/ebl/tests/atf_importer/test_atf_preprocessor.py index 5b5a64744..a4639abe2 100644 --- a/ebl/tests/atf_importer/test_atf_preprocessor.py +++ b/ebl/tests/atf_importer/test_atf_preprocessor.py @@ -2,12 +2,11 @@ import json from ebl.atf_importer.domain.atf_preprocessor import AtfPreprocessor - PROBLEMATIC_TEXT_LINES = [ ( - "1. 
[*] AN#.GE₆ GAR-ma U₄ ŠU₂{+up} * AN.GE₆ GAR-ma {d}IŠKUR KA-šu₂ ŠUB{" + "1. [*] AN#.GE₆ GAR-ma U₄ ŠÚ{+up} * AN.GE₆ GAR-ma {d}IŠKUR KA-šú ŠUB{" "+di} * AN.GE₆", - "1. [ DIŠ] AN#.GE₆ GAR-ma U₄ ŠU₂{+up} DIŠ AN.GE₆ GAR-ma {d}IŠKUR KA-šu₂ " + "1. [DIŠ] AN#.GE₆ GAR-ma U₄ ŠU₂{+up} DIŠ AN.GE₆ GAR-ma {d}IŠKUR KA-šu₂ " "ŠUB{+di} DIŠ AN.GE₆", ), ( @@ -17,8 +16,8 @@ "ud-da-a-ta", ), ( - "14. [...] x (x) še-e-hu $BAD $E₂ $ME : ina GAŠAN-ia₅ {d}SUEN {" - "d}INANA--.AN.NA", + "14. [...] x (x) še-e-hu $BAD $É $ME : ina GAŠAN-ia₅ {d}SUEN {" + "d}INANA--<É>.AN.NA", "14. [...] x (x) še-e-hu BAD E₂ ME : ina GAŠAN-ia₅ {d}SUEN {" "d}INANA-.AN.NA", ), @@ -41,18 +40,23 @@ @pytest.mark.parametrize( - "line,expected", + "legacy_line,ebl_line", [*PROBLEMATIC_TEXT_LINES, FOLLOWING_SIGN_IS_NOT_A_LOGOGRAM, *LEGACY_GRAMMAR_SIGNS], ) -def test_text_lines(line, expected): +def test_text_lines(legacy_line, ebl_line): + # ToDo: fix atf_preprocessor = AtfPreprocessor("../logs", 0) - ( - converted_line, - c_array, - c_type, - c_alter_lem_line_at, - ) = atf_preprocessor.process_line(line) - assert converted_line == expected + legacy_tree = atf_preprocessor.ebl_parser.parse(legacy_line) + legacy_tree = atf_preprocessor.transform_legacy_atf(legacy_tree) + + expected_tree = atf_preprocessor.ebl_parser.parse(ebl_line) + expected_tree = atf_preprocessor.transform_legacy_atf(expected_tree) + + # (converted_line,) = atf_preprocessor.process_line(legacy_line) + print(legacy_tree) + print(expected_tree) + + assert legacy_tree == expected_tree lemma_lines = [] diff --git a/ebl/transliteration/domain/atf_parsers/lark_parser/legacy_atf.lark b/ebl/transliteration/domain/atf_parsers/lark_parser/legacy_atf.lark index c924a7d90..ced076186 100644 --- a/ebl/transliteration/domain/atf_parsers/lark_parser/legacy_atf.lark +++ b/ebl/transliteration/domain/atf_parsers/lark_parser/legacy_atf.lark @@ -5,9 +5,9 @@ LEGACY_OPEN_HALF_BRACKET: "⌈" | "⸢" LEGACY_CLOSE_HALF_BRACKET: "⌉" | "⸣" LEGACY_ORACC_DISH_DIVIDER: "*" 
legacy_uncertain_sign_prefix: "$" - legacy_single_ruling: " "* "ruling" [" "? ("!?" | "*" | "?" | "!")] + //# ToDo: //# Implement here the following: //# oracc_atf_text_line__uncertain_sign