Skip to content

Commit

Permalink
Update transformers pipeline & tests (WiP)
Browse files Browse the repository at this point in the history
  • Loading branch information
khoidt committed Oct 29, 2024
1 parent 276cc63 commit 0a0e481
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 76 deletions.
3 changes: 1 addition & 2 deletions ebl/atf_importer/domain/atf_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,8 @@ def transform_legacy_atf(self, tree: Tree) -> Tree:
visitor = LegacyAtfVisitor()
visitor.visit(tree)
print('!!!! visitor.legacy_found', visitor.legacy_found)
if visitor.legacy_found==True:
if visitor.legacy_found:
self.logger.info("Legacy line successfully parsed")
print(tree.pretty)
return tree

def parse_and_convert_line(
Expand Down
123 changes: 81 additions & 42 deletions ebl/atf_importer/domain/legacy_atf_visitor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re
from typing import Optional, Sequence, Callable
from lark.visitors import Visitor, Transformer, Tree, Token, v_args
from ebl.transliteration.domain.atf import _SUB_SCRIPT

# ToDo: Continue from here
# Make sure every transformer is implemented and works properly.
Expand All @@ -16,13 +17,20 @@ def __init__(self):
super().__init__()
self.legacy_found = False

# Reset the legacy flag so the same transformer instance can be reused for another run.
def clear(self):
self.legacy_found = False


class HalfBracketsTransformer(LegacyTransformer):
# ToDo: Check that this works

def __init__(self):
    """Create the transformer with no half bracket currently open.

    The base initialiser is called first so that ``legacy_found`` exists
    from construction onwards, instead of only after the first ``clear()``.
    """
    super().__init__()
    self.open = False

# Reset the legacy flag and the `open` state before the instance is reused.
def clear(self):
self.legacy_found = False
self.open = False

@v_args(inline=True)
def ebl_atf_text_line__LEGACY_OPEN_HALF_BRACKET(self, bracket: str) -> str:
print("! bbbbbb", bracket)
Expand Down Expand Up @@ -84,72 +92,107 @@ class AccentedIndexTransformer(LegacyTransformer):
"Ì": "I",
"Ù": "U",
}
patterns = ((re.compile("[áéíúÁÉÍÚ]"), "₂"), (re.compile("[àèìùÀÈÌÙ]"), "₃"))
accented_index_patterns = (
(re.compile("[áéíúÁÉÍÚ]"), "₂"),
(re.compile("[àèìùÀÈÌÙ]"), "₃"),
)

def __init__(self, **kwargs):
super().__init__(**kwargs)
self.sub_index = None

# Reset the legacy flag and forget any sub-index stored by a previous run.
def clear(self):
self.legacy_found = False
self.sub_index = None

@v_args(inline=True)
def ebl_atf_text_line__VALUE_CHARACTER(self, char: str) -> str:
    """Replace a legacy accented value character, leaving others untouched.

    Characters listed in ``replacement_chars`` are passed to
    ``_transform_accented_vowel`` and its result is returned; any other
    character is returned unchanged.
    """
    if char in self.replacement_chars:
        return self._transform_accented_vowel(char)
    return char

@v_args(inline=True)
def ebl_atf_text_line__LOGOGRAM_CHARACTER(self, char: str) -> str:
    """Replace a legacy accented logogram character, leaving others untouched.

    Characters listed in ``replacement_chars`` are passed to
    ``_transform_accented_vowel`` and its result is returned; any other
    character is returned unchanged.
    """
    if char in self.replacement_chars:
        return self._transform_accented_vowel(char)
    return char

@v_args(inline=True)
def ebl_atf_text_line__sub_index(self, char: Optional[str]) -> Optional[str]:
print("!!!!!!!!!!!!!!!!!!!! ebl_atf_text_line__sub_index")
return self.sub_index if self.sub_index and not char else char
def ebl_atf_text_line__sub_index(self, sub_index: Optional[str]) -> Optional[str]:
if sub_index and sub_index[0] in _SUB_SCRIPT.keys():
self.legacy_found = True
self._set_sub_index("".join(_SUB_SCRIPT[digit] for digit in sub_index))
return self.sub_index if self.sub_index else sub_index

def _transform_accented_vowel(self, char: str) -> str:
self._set_sub_index(char)
self._set_sub_index_from_accented(char)
self.legacy_found = True
return self.replacement_chars[char]

def _set_sub_index(self, char: str) -> None:
for pattern, suffix in self.patterns:
def _set_sub_index_from_accented(self, char: str) -> None:
for pattern, sub_index in self.accented_index_patterns:
if pattern.search(char):
self.sub_index = suffix
self._set_sub_index(sub_index)
break

def _set_sub_index(self, sub_index: str) -> None:
    """Store ``sub_index`` wrapped as an ``ebl_atf_text_line__sub_index`` tree.

    The tree has a single ``ebl_atf_text_line__SUB_INDEX`` token child
    carrying the given string.
    """
    index_token = Token("ebl_atf_text_line__SUB_INDEX", sub_index)
    self.sub_index = Tree("ebl_atf_text_line__sub_index", [index_token])


accented_index_transformer = AccentedIndexTransformer()
half_brackets_transformer = HalfBracketsTransformer()
oracc_joiner_transformer = OraccJoinerTransformer()
oracc_special_transformer = OraccSpecialTransformer()
index_and_accented_transformer = (AccentedIndexTransformer(), "all_children")
half_brackets_transformer = (HalfBracketsTransformer(), "first_child")
oracc_joiner_transformer = (OraccJoinerTransformer(), "first_child")
oracc_special_transformer = (OraccSpecialTransformer(), "first_child")


class LegacyAtfVisitor(Visitor):
# ToDo: Continue from here.
# Move all atf preprocessing here
# ?Try to convert to string and then parse?
text_line_prefix = "ebl_atf_text_line"
tokens_to_visit = {
"number": [accented_index_transformer],
"reading": [accented_index_transformer],
"logogram": [accented_index_transformer, oracc_special_transformer],
"surrogate": [accented_index_transformer],
"GRAPHEME_NAME": [accented_index_transformer],
"number": [index_and_accented_transformer],
"reading": [index_and_accented_transformer],
"logogram": [index_and_accented_transformer, oracc_special_transformer],
"surrogate": [index_and_accented_transformer],
"grapheme": [index_and_accented_transformer],
"_parts_pattern": [half_brackets_transformer],
"_parts_pattern_gloss": [half_brackets_transformer],
"LEGACY_ORACC_JOINER": [oracc_joiner_transformer],
}

# ToDo: Fix the nested `sub_index` within a sign, as in `reading`.
"""
ebl_atf_text_line__word
ebl_atf_text_line__surrogate <-- ! Main parent
ebl_atf_text_line__logogram_name
ebl_atf_text_line__logogram_name_part
Š
U
ebl_atf_text_line__sub_index ₂ <-- ! The expected subindex
ebl_atf_text_line__modifiers
ebl_atf_text_line__flags
ebl_atf_text_line__surrogate_text
ebl_atf_text_line__reading
ebl_atf_text_line__value_name
ebl_atf_text_line__value_name_part
š
u
m
m
a
ebl_atf_text_line__sub_index ₂ <-- ! Problem here. Deeply nested second `sub_index`
ebl_atf_text_line__modifiers
ebl_atf_text_line__flags
None
"""

def __init__(self):
    """Initialise the visitor and register a handler for every visited rule.

    One handler is installed per entry of ``tokens_to_visit`` via
    ``_set_rules``. The constructor must not block: the former ``input()``
    debug pause waited on stdin and therefore stalled (or raised) in
    non-interactive runs.
    """
    super().__init__()
    self.legacy_found = False
    for suffix, transformers in self.tokens_to_visit.items():
        self._set_rules(suffix, transformers)

def _set_rules(
self,
Expand All @@ -161,28 +204,24 @@ def _set_rules(
setattr(
self,
f"{prefix}__{suffix}",
self._wrap_legacy_found(transformers),
self._wrap_transformers(transformers),
)

def _wrap_legacy_found(
def _wrap_transformers(
self, transformers: Sequence[LegacyTransformer]
) -> Callable[[Tree], None]:
def _method(tree: Tree) -> Tree:
for transformer in transformers:
# ToDo: Continue from here. Top Priority.
# There is an error that is likely caused by children being
# assigned to the token (`tree`) element without regard to
# its internal structure.
# A possible approach for complex transformers (such as `AccentedIndexTransformer`)
# might be saving the element as an attribute
# of the `LegacyTransformer` class, then extracting it, e.g.:
# transformer.transform(tree)
# tree.children[0] = transformer.result
# Make sure, however, that old results are not memorized:
# Either initiate new instances or (better?) renew them on each run.
tree.children[0] = transformer.transform(tree)
if transformer.legacy_found:
self.legacy_found = True
print("\nTransformed Tree:", tree.pretty())
for transformer, replace in transformers:
self._transform(tree, transformer, replace)

return _method

def _transform(self, tree: Tree, transformer: LegacyTransformer, replace: str):
transformer.clear()
transformed = transformer.transform(tree)
if transformer.legacy_found:
self.legacy_found = True
if replace == "first_child":
tree.children[0] = transformed.children[0]
elif replace == "all_children":
tree.children = transformed.children
18 changes: 10 additions & 8 deletions ebl/tests/atf_importer/test_atf_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,16 @@

PROBLEMATIC_TEXT_LINES = [
(
"1. ŠÚ",
"1. ŠU₂",
"1. ŠÚ ù ŠÚ<(šumma)> |ŠÚ+ŠÚ|",
"1. ŠU₂ u₃ ŠU₂<(šumma)> |ŠU₂+ŠU₂|",
),
(
"1. [*]",
"1. [DIŠ]",
"1. [*] * *-*",
"1. [DIŠ] DIŠ DIŠ-DIŠ",
),
(
"1. ŠU2 u3 ŠU2<(šumma)> |ŠU2+ŠU2|",
"1. ŠU₂ u₃ ŠU₂<(šumma)> |ŠU₂+ŠU₂|",
),
(
"1. [*] AN#.GE₆ GAR-ma U₄ ŠÚ{+up} * AN.GE₆ GAR-ma {d}IŠKUR KA-šú ŠUB{"
Expand Down Expand Up @@ -58,11 +62,9 @@ def test_text_lines(legacy_line, ebl_line):
legacy_tree = atf_preprocessor.transform_legacy_atf(legacy_tree)

expected_tree = atf_preprocessor.ebl_parser.parse(ebl_line)
expected_tree = atf_preprocessor.transform_legacy_atf(expected_tree)

# (converted_line,) = atf_preprocessor.process_line(legacy_line)
# print(legacy_tree)
# print(expected_tree)
print('RESULT:\n', legacy_tree.pretty())
print('EXPECTED:\n', expected_tree.pretty())

assert legacy_tree == expected_tree

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,17 @@ LOGOGRAM_CHARACTER_MAIN: "A" | "Ā" | "Â" | "B" | "D" | "E"
| "W" | "Z" | "" | "ʾ"

VALUE_CHARACTER: VALUE_CHARACTER_MAIN | LEGACY_VALUE_CHARACTER_ACCENTED
LOGOGRAM_CHARACTER: LOGOGRAM_CHARACTER_MAIN | LEGACY_LOGOGRAM_CHARACTER_ACCENTED
LOGOGRAM_CHARACTER: LOGOGRAM_CHARACTER_MAIN | LEGACY_LOGOGRAM_CHARACTER_ACCENTED

GREEK_ALPHABET: "Α" | "α" | "Β" | "β" | "Γ" | "γ" | "Δ" | "δ" | "Ε" | "ε"
| "Ζ" | "ζ" | "Η" | "η" | "Θ" | "θ" | "Ι" | "ι" | "Κ" | "κ"
| "Λ" | "λ" | "Μ" | "μ" | "Ν" | "ν" | "Ξ" | "ξ" | "Ο" | "ο"
| "Π" | "π" | "Ρ" | "ρ" | "Σ" | "σ" | "ς" | "Τ" | "τ" | "Υ"
| "υ" | "Φ" | "φ" | "Χ" | "χ" | "Ψ" | "ψ" | "Ω" | "ω"

AKKADIAN_ALPHABET: "ʾ" | "A" | "B" | "D" | "E" | "G" | "H" | "I" | "K" | "L"
| "M" | "N" | "P" | "S" | "T" | "U" | "Y" | "Z" | "a" | "b"
| "c" | "d" | "e" | "f" | "g" | "h" | "i" | "k" | "l" | "m"
| "n" | "p" | "q" | "r" | "s" | "t" | "u" | "w" | "y" | "z"
| "É" | "â" | "ê" | "î" | "û" | "ā" | "Ē" | "ē" | "ī" | "Š"
| "š" | "Ś" | "ś" | "ū" | "" | "" | ""
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
%import .ebl_atf_common (surface, SURFACE, generic_surface, face, edge)
%import .ebl_atf_common (seal)
%import .ebl_atf_common (VALUE_CHARACTER, LOGOGRAM_CHARACTER)
%import .ebl_atf_common (AKKADIAN_ALPHABET, GREEK_ALPHABET)
%import .ebl_atf_abbreviations (PERIOD)
%import .legacy_atf (LEGACY_OPEN_HALF_BRACKET, LEGACY_CLOSE_HALF_BRACKET)
%import .legacy_atf (LEGACY_ORACC_JOINER, LEGACY_ORACC_DISH_DIVIDER)
Expand Down Expand Up @@ -222,7 +223,6 @@ reading: value_name sub_index modifiers flags ["(" any_grapheme ")"]
logogram: logogram_name sub_index modifiers flags ["(" any_grapheme ")"]
surrogate: logogram_name sub_index modifiers flags "<(" surrogate_text ")>"
surrogate_text: reading (joiner reading)*
sub_index: [SUB_INDEX]

number_name: number_name_head (broken_away number_name_part)*
value_name: value_name_part (broken_away value_name_part)*
Expand All @@ -233,20 +233,20 @@ value_name_part: VALUE_CHARACTER+
logogram_name_part: LOGOGRAM_CHARACTER+

?any_grapheme: compound_grapheme | grapheme
compound_grapheme: "|" COMPOUND_FRAGMENT ("." COMPOUND_FRAGMENT)* "|"
COMPOUND_FRAGMENT: SUB_COMPOUND (COMPOUND_OPERATOR SUB_COMPOUND)*
SUB_COMPOUND: "(" COMPOUND_PART ((COMPOUND_OPERATOR) COMPOUND_PART)* ")"
| COMPOUND_PART
COMPOUND_PART: GRAPHEME (VARIANT_SEPARATOR GRAPHEME)*
compound_grapheme: "|" compound_fragment ("." compound_fragment)* "|"
?compound_fragment: sub_compound (COMPOUND_OPERATOR sub_compound)*
?sub_compound: "(" compound_part ((COMPOUND_OPERATOR) compound_part)* ")"
| compound_part
?compound_part: grapheme (VARIANT_SEPARATOR grapheme)*
COMPOUND_OPERATOR: "×" | "%" | "&" | "+" | "."

grapheme: GRAPHEME_NAME modifiers flags
GRAPHEME: GRAPHEME_NAME MODIFIER* FLAG
GRAPHEME_NAME: GRAPHEME_CHARACTER (GRAPHEME_CHARACTER)* SUB_INDEX?
GRAPHEME_CHARACTER: VALUE_CHARACTER | LOGOGRAM_CHARACTER | "0".."9"
grapheme: grapheme_name sub_index modifiers flags
grapheme_name: grapheme_name_part
grapheme_name_part: VALUE_CHARACTER+ | LOGOGRAM_CHARACTER+

sub_index: [SUB_INDEX]
SUB_INDEX: NUMERIC_SUB_INDEX | ""
NUMERIC_SUB_INDEX: "" | "".."" ("".."")*
NUMERIC_SUB_INDEX: "" | "".."" ("".."")* | "1".."9" ("0".."9")*

unidentified_sign: "X" flags
unclear_sign: "x" flags
Expand Down Expand Up @@ -310,12 +310,6 @@ open_emendation: "<"
close_emendation: ">"

akkadian_string: AKKADIAN_ALPHABET+
AKKADIAN_ALPHABET: "ʾ" | "A" | "B" | "D" | "E" | "G" | "H" | "I" | "K" | "L"
| "M" | "N" | "P" | "S" | "T" | "U" | "Y" | "Z" | "a" | "b"
| "c" | "d" | "e" | "f" | "g" | "h" | "i" | "k" | "l" | "m"
| "n" | "p" | "q" | "r" | "s" | "t" | "u" | "w" | "y" | "z"
| "É" | "â" | "ê" | "î" | "û" | "ā" | "Ē" | "ē" | "ī" | "Š"
| "š" | "Ś" | "ś" | "ū" | "" | "" | ""

_greek: greek_token (_WORD_SEPARATOR greek_token)*
?greek_token: greek_word
Expand All @@ -327,12 +321,6 @@ greek_word: (greek_enclosure | greek_word_part)* greek_word_part (greek_enclosur
| unknown_number_of_signs
?greek_enclosure: _any_open | _any_close
greek_letter: GREEK_ALPHABET flags
GREEK_ALPHABET: "Α" | "α" | "Β" | "β" | "Γ" | "γ" | "Δ" | "δ" | "Ε" | "ε"
| "Ζ" | "ζ" | "Η" | "η" | "Θ" | "θ" | "Ι" | "ι" | "Κ" | "κ"
| "Λ" | "λ" | "Μ" | "μ" | "Ν" | "ν" | "Ξ" | "ξ" | "Ο" | "ο"
| "Π" | "π" | "Ρ" | "ρ" | "Σ" | "σ" | "ς" | "Τ" | "τ" | "Υ"
| "υ" | "Φ" | "φ" | "Χ" | "χ" | "Ψ" | "ψ" | "Ω" | "ω"


labels: surface_label " " column_label
| surface_label
Expand Down

0 comments on commit 0a0e481

Please sign in to comment.