Skip to content

Commit

Permalink
Update, refactor & fix (WiP)
Browse files Browse the repository at this point in the history
  • Loading branch information
khoidt committed Oct 23, 2024
1 parent 4e4693f commit b4b8159
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 59 deletions.
4 changes: 2 additions & 2 deletions ebl/atf_importer/domain/atf_conversions.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def __init__(self):
self.alter_lem_line_at = []
self.removal_open = False

def ebl_atf_text_line__word(self, tree):
def ebl_atf_text_line__word(self, tree: Tree) -> None:
assert tree.data == "ebl_atf_text_line__word"
word = ""

Expand All @@ -91,7 +91,7 @@ def ebl_atf_text_line__word(self, tree):
self.removal_open = False
self.alter_lem_line_at.append(self.wordcounter)
elif isinstance(child, lexer.Token):
word += child
word += child # ToDo: Check, fix type error
else:
word += DepthFirstSearch().visit_topdown(child, "")

Expand Down
14 changes: 10 additions & 4 deletions ebl/atf_importer/domain/atf_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from ebl.atf_importer.domain.atf_preprocessor_util import Util
from ebl.atf_importer.domain.atf_conversions import GetWords
from ebl.atf_importer.domain.legacy_atf_visitor import LegacyAtfVisitor
# from ebl.transliteration.domain.line_transformer import LineTransformer


class AtfPreprocessor(AtfPreprocessorBase):
Expand Down Expand Up @@ -52,13 +53,16 @@ def process_line(
return self.parse_and_convert_line(atf_line)

def check_original_line(self, atf: str) -> Tuple[str, List[Any], str, List[Any]]:
input(f"! check_original_line. [{atf}]")
print("! check_original_line.")
if self.style == 2 and atf[0] == "#" and atf[1] == " ":
atf = atf.replace("#", "#note:")
atf = atf.replace("# note:", "#note:")
input("! before parse")
# input(f"! before parse:\n{atf}")
tree = self.ebl_parser.parse(atf)
input("! before transform")
# print(tree.pretty())
# input(f"! after parse:\n{self.line_tree_to_string(tree)}")
# input("! before transform")
# input("! after transform")
tree = self.transform_legacy_atf(tree)
self.logger.info("Line successfully parsed")
self.logger.debug(f"Parsed line as {tree.data}")
Expand All @@ -81,7 +85,9 @@ def parse_and_convert_line(
try:
tree = self.ebl_parser.parse(atf)
if tree.data in self.unused_lines:
result = self.get_empty_conversion(tree)
# result = self.get_empty_conversion(tree)
# ToDo: Check original
return tree
elif tree.data == "lem_line":
result = self.convert_lem_line(atf, tree)
elif tree.data == "text_line":
Expand Down
20 changes: 7 additions & 13 deletions ebl/atf_importer/domain/atf_preprocessor_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,12 +110,6 @@ def __init__(self, logdir: str, style: int) -> None:
# Previously: "lark-oracc/oracc_atf.lark",
# This should be eventually removed completely.

# self.oracc_parser = Lark.open(
# "../../transliteration/domain/atf_parsers/lark_parser/ebl_atf.lark",
# maybe_placeholders=True,
# rel_to=__file__,
# )

self.logger = logging.getLogger("Atf-Preprocessor")
self.logger.setLevel(logging.DEBUG)
self.skip_next_lem_line = False
Expand Down Expand Up @@ -169,13 +163,13 @@ def unused_line(
return (None, None, tree.data, None)

def convert_lem_line(
self, atf: str, tree
self, atf: str, tree: Tree
) -> Tuple[Optional[str], Optional[List[Any]], str, Optional[List[Any]]]:
if self.skip_next_lem_line:
self.logger.warning("Skipping lem line due to previous flag.")
self.skip_next_lem_line = False
return (None, None, "lem_line", None)
lemmas_and_guidewords_array = self.serizalize_lemmas_and_guidewords()
lemmas_and_guidewords_array = self.serizalize_lemmas_and_guidewords(tree)
self.logger.debug(
"Converted line as "
+ tree.data
Expand All @@ -186,6 +180,7 @@ def convert_lem_line(
return atf, lemmas_and_guidewords_array, tree.data, []

def line_tree_to_string(self, tree: Tree) -> str:
# ToDo: Remove
line_serializer = LineSerializer()
line_serializer.visit_topdown(tree)
return line_serializer.line.strip(" ")
Expand Down Expand Up @@ -240,11 +235,9 @@ def _replace_dashes(self, atf: str) -> str:
return re.sub(r"–|--", "-", atf)

def _normalize_patterns(self, atf: str) -> str:
callback_normalize = (
lambda pat: pat.group(1)
+ pat.group(2)
+ self._normalize_numbers(pat.group(3))
)
def callback_normalize(pat):
return pat.group(1) + pat.group(2) + self._normalize_numbers(pat.group(3))

return re.sub(r"(.*?)([a-zA-Z])(\d+)", callback_normalize, atf)

def _replace_primed_digits(self, atf: str) -> str:
Expand Down Expand Up @@ -283,6 +276,7 @@ def _handle_dollar_line(self, atf: str) -> str:
def _process_bracketed_parts(self, atf: str) -> str:
self.open_found = False
split = re.split(r"([⌈⌉⸢⸣])", atf)
# ToDo: Remove:
if len(split) > 1 and atf.startswith("9. ⸢4(BÁN)?⸣"):
# ToDo: Continue from here.
# Problem with `4(BÁN)#?`, which is not in lark grammar
Expand Down
63 changes: 40 additions & 23 deletions ebl/atf_importer/domain/legacy_atf_visitor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re
from typing import Optional, Sequence, Callable
from lark.visitors import Visitor, Transformer, Tree, v_args
from lark.visitors import Visitor, Transformer, Tree, Token, v_args

# ToDo: Continue from here
# Make sure every transformer is implemented and works properly.
Expand All @@ -25,33 +25,33 @@ class LegacyAtfVisitor(Visitor):
sign_rules = ["number", "reading", "logogram", "surrogate", "GRAPHEME_NAME"]
legacy_damage_rules = ["_parts_pattern", "_parts_pattern_gloss"]
legacy_joiner_rulers = ["LEGACY_ORACC_JOINER"]
legacy_special_rulers = ["LEGACY_ORACC_DISH_SIGN"]
legacy_special_rulers = ["logogram"]

def __init__(self):
super().__init__()
self.legacy_found = False
self._set_rules(self.sign_rules, self.transform_legacy_sign)
self._set_rules(self.sign_rules, self.get_legacy_sign_transformer)
self._set_rules(
self.legacy_damage_rules,
self.transform_legacy_damage,
self.get_legacy_damage_transformer,
)
self._set_rules(
self.legacy_joiner_rulers,
self.transform_legacy_joiner,
self.get_legacy_joiner_transformer,
)
self._set_rules(
self.legacy_divider_rulers,
self.transform_legacy_special,
self.legacy_special_rulers,
self.get_legacy_special_transformer,
)
input("legacy visitor initiated")

def _set_rules(
self,
rules: Sequence[str],
method: Callable[[Tree], Transformer],
prefix: Optional[str],
method: Callable[[Tree], LegacyTransformer],
prefix: Optional[str] = None,
) -> None:
prefix = perfix if prefix else self.text_line_prefix
prefix = prefix if prefix else self.text_line_prefix
for rule in rules:
setattr(
self,
Expand All @@ -60,30 +60,39 @@ def _set_rules(
)

def _wrap_legacy_found(
self, method: Callable[[Tree], Transformer]
) -> Callable[[Tree], None]:
def _method(tree: Tree) -> None:
self, method: Callable[[Tree], LegacyTransformer]
) -> Callable[[Tree], Tree]:
def _method(tree: Tree) -> Tree:
# ToDo: Continue from here. Highest priority.
# Transformations happen, but the original tree won't change.
transformer = method(tree)
tree = transformer.transform(tree)
if transformer.legacy_found:
self.legacy_found = True
print("\n!!!! tree", tree)
return tree

return _method

def transform_legacy_sign(self, tree: Tree) -> None:
return AccentedIndexTransformer(visit_tokens=True).transform(tree)
def get_legacy_sign_transformer(self, tree: Tree) -> LegacyTransformer:
return AccentedIndexTransformer()

def transform_legacy_damage(self, tree: Tree) -> None:
return HalfBracketsTransformer().transform(tree)
def get_legacy_damage_transformer(self, tree: Tree) -> LegacyTransformer:
return HalfBracketsTransformer()

def transform_legacy_joiner(self, tree: Tree) -> None:
return OraccJoinerTransformer().transform(tree)
def get_legacy_joiner_transformer(self, tree: Tree) -> LegacyTransformer:
return OraccJoinerTransformer()

def transform_legacy_special(self, tree: Tree) -> None:
return OraccSpecialTransformer().transform(tree)
def get_legacy_special_transformer(self, tree: Tree) -> LegacyTransformer:
return OraccSpecialTransformer()


class HalfBracketsTransformer(LegacyTransformer):
# ToDo: Check if works

def __init__(self):
self.open = False

@v_args(inline=True)
def ebl_atf_text_line__LEGACY_OPEN_HALF_BRACKET(self, bracket: str) -> str:
print("! bbbbbb", bracket)
Expand All @@ -110,15 +119,20 @@ def ebl_atf_text_line__flags(self, flags: str):
class OraccJoinerTransformer(LegacyTransformer):
@v_args(inline=True)
def ebl_atf_text_line__LEGACY_ORACC_JOINER(self, bracket: str) -> str:
print("!!!!!!!!!!!!!!!!!!!! LEGACY_ORACC_JOINER")
self.legacy_found = True
return "-"


class OraccSpecialTransformer(LegacyTransformer):
@v_args(inline=True)
def ebl_atf_text_line__LEGACY_ORACC_DISH_DIVIDER(self, bracket: str) -> str:
def ebl_atf_text_line__LEGACY_ORACC_DISH_DIVIDER(self, child: str) -> Tree:
print("!!!!!!!!!!!!!!!!!!!! LEGACY_ORACC_DISH_DIVIDER")
self.legacy_found = True
return "DIŠ"
return Tree(
"ebl_atf_text_line__logogram_name_part",
[Token("ebl_atf_text_line__LOGOGRAM_CHARACTER", char) for char in "DIŠ"],
)


class AccentedIndexTransformer(LegacyTransformer):
Expand Down Expand Up @@ -148,14 +162,17 @@ def __init__(self, **kwargs):

@v_args(inline=True)
def ebl_atf_text_line__LEGACY_VALUE_CHARACTER_ACCENTED(self, char: str) -> str:
print("!!!!!!!!!!!!!!!!!!!! LEGACY_VALUE_CHARACTER_ACCENTED")
return self._transform_accented_vowel(char)

@v_args(inline=True)
def ebl_atf_text_line__LEGACY_LOGOGRAM_CHARACTER_ACCENTED(self, char: str) -> str:
print("!!!!!!!!!!!!!!!!!!!! LEGACY_LOGOGRAM_CHARACTER_ACCENTED")
return self._transform_accented_vowel(char)

@v_args(inline=True)
def ebl_atf_text_line__sub_index(self, char: Optional[str]) -> Optional[str]:
print("!!!!!!!!!!!!!!!!!!!! ebl_atf_text_line__sub_index")
return self.sub_index if self.sub_index and not char else char

def _transform_accented_vowel(self, char: str) -> str:
Expand Down
3 changes: 1 addition & 2 deletions ebl/signs/infrastructure/mongo_sign_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,8 @@
Fossey,
SortKeys,
)

from ebl.transliteration.application.museum_number_schema import MuseumNumberSchema
from ebl.transliteration.domain.lark_parser import parse_atf_lark
from ebl.transliteration.domain.atf_parsers.lark_parser import parse_atf_lark

COLLECTION = "signs"

Expand Down
32 changes: 18 additions & 14 deletions ebl/tests/atf_importer/test_atf_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,11 @@
import json
from ebl.atf_importer.domain.atf_preprocessor import AtfPreprocessor


PROBLEMATIC_TEXT_LINES = [
(
"1. [*] AN#.GE₆ GAR-ma U₄ ŠU₂{+up} * AN.GE₆ GAR-ma {d}IŠKUR KA-šu₂ ŠUB{"
"1. [*] AN#.GE₆ GAR-ma U₄ ŠÚ{+up} * AN.GE₆ GAR-ma {d}IŠKUR KA-šú ŠUB{"
"+di} * AN.GE₆",
"1. [ DIŠ] AN#.GE₆ GAR-ma U₄ ŠU₂{+up} DIŠ AN.GE₆ GAR-ma {d}IŠKUR KA-šu₂ "
"1. [DIŠ] AN#.GE₆ GAR-ma U₄ ŠU₂{+up} DIŠ AN.GE₆ GAR-ma {d}IŠKUR KA-šu₂ "
"ŠUB{+di} DIŠ AN.GE₆",
),
(
Expand All @@ -17,8 +16,8 @@
"ud-da-a-ta",
),
(
"14. [...] x (x) še-e-hu $BAD $E₂ $ME : ina GAŠAN-ia₅ {d}SUEN {"
"d}INANA--<E₂>.AN.NA",
"14. [...] x (x) še-e-hu $BAD $É $ME : ina GAŠAN-ia₅ {d}SUEN {"
"d}INANA--<É>.AN.NA",
"14. [...] x (x) še-e-hu BAD E₂ ME : ina GAŠAN-ia₅ {d}SUEN {"
"d}INANA-<E₂>.AN.NA",
),
Expand All @@ -41,18 +40,23 @@


@pytest.mark.parametrize(
"line,expected",
"legacy_line,ebl_line",
[*PROBLEMATIC_TEXT_LINES, FOLLOWING_SIGN_IS_NOT_A_LOGOGRAM, *LEGACY_GRAMMAR_SIGNS],
)
def test_text_lines(line, expected):
def test_text_lines(legacy_line, ebl_line):
# ToDo: fix
atf_preprocessor = AtfPreprocessor("../logs", 0)
(
converted_line,
c_array,
c_type,
c_alter_lem_line_at,
) = atf_preprocessor.process_line(line)
assert converted_line == expected
legacy_tree = atf_preprocessor.ebl_parser.parse(legacy_line)
legacy_tree = atf_preprocessor.transform_legacy_atf(legacy_tree)

expected_tree = atf_preprocessor.ebl_parser.parse(ebl_line)
expected_tree = atf_preprocessor.transform_legacy_atf(expected_tree)

# (converted_line,) = atf_preprocessor.process_line(legacy_line)
print(legacy_tree)
print(expected_tree)

assert legacy_tree == expected_tree


lemma_lines = []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ LEGACY_OPEN_HALF_BRACKET: "⌈" | "⸢"
LEGACY_CLOSE_HALF_BRACKET: "⌉" | "⸣"
LEGACY_ORACC_DISH_DIVIDER: "*"
legacy_uncertain_sign_prefix: "$"

legacy_single_ruling: " "* "ruling" [" "? ("!?" | "*" | "?" | "!")]


//# ToDo:
//# Implement here the following:
//# oracc_atf_text_line__uncertain_sign
Expand Down

0 comments on commit b4b8159

Please sign in to comment.