Chinese & Uighur fixes. #140

Merged
merged 24 commits on Dec 9, 2024
Changes from 11 commits
Commits
e38dafa
Add personal name handling in Chinese.
scossu Oct 13, 2024
2d43865
Merge pull request #139 from lcnetdev/chinese_names
scossu Oct 13, 2024
7405119
Fix variable name in Chinese hook.
scossu Oct 13, 2024
4e955ca
Fix one more variable.
scossu Oct 13, 2024
3510651
Add case insensitive flag to Uighur (Arabic).
scossu Oct 19, 2024
408e57a
WIP ignore by regular expression.
scossu Oct 22, 2024
e2f0d2b
Temp fix for some Uighur characters.
scossu Oct 22, 2024
309d965
Separate Thai words.
scossu Oct 22, 2024
237f1f8
Use model name shorthand.
scossu Oct 22, 2024
ac29135
Merge pull request #141 from lcnetdev/thai_word_splitting
scossu Oct 22, 2024
3e35029
Preload Thai LLM.
scossu Oct 22, 2024
a71d02a
Merge pull request #142 from lcnetdev/thai_word_splitting
scossu Oct 22, 2024
67a359f
Move preloading.
scossu Oct 22, 2024
2f54bf9
Merge pull request #143 from lcnetdev/thai_word_splitting
scossu Oct 22, 2024
e730c9e
Make mail from and to env vars conditional.
scossu Oct 22, 2024
c22fc69
Merge branch 'thai_word_splitting' into test
scossu Oct 22, 2024
22cf1ea
Merge branch 'test' of gh:lcnetdev/scriptshifter into re
scossu Nov 1, 2024
6c5cab4
WIP regexp and testing framework.
scossu Nov 16, 2024
90b9f4c
Do unit test properly.
scossu Nov 17, 2024
7e0722d
Pass minimum test set.
scossu Nov 18, 2024
efb27b8
WIP Add tests for regex ignore patterns.
scossu Nov 25, 2024
91f3ad9
Complete basic tests for regex ignore.
scossu Dec 3, 2024
fbfdec4
Remove Aksharamukha's Thai from menu; rename thai_alt to thai.
scossu Dec 3, 2024
9b98c2e
Merge branch 're' into test
scossu Dec 3, 2024
3 changes: 3 additions & 0 deletions entrypoint.sh
@@ -14,6 +14,9 @@ port=${TXL_WEBAPP_PORT:-"8000"}

./sscli admin init-db

# Preload Thai model.
python -c 'from esupar import load; load("th")'

if [ "${FLASK_ENV}" == "development" ]; then
exec flask run -h $host -p $port
else
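The added preload line pays the model download and initialization cost at container start rather than on the first request. A minimal sketch of the same warm-up in plain Python, assuming esupar caches the downloaded model between calls:

```python
# Warm-up sketch (assumption: esupar caches the downloaded model, so the
# load("th") call inside the web app later is fast).
from esupar import load

nlp = load("th")           # downloads and caches the Thai model
print(type(nlp).__name__)  # sanity check that a pipeline object was built
```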
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,5 +1,6 @@
# Core application dependencies.
aksharamukha>=2.2,<3
esupar>=1.7.5
flask>=2.3,<3
flask-cors>=4.0,<5
python-dotenv>=1.0,<2
8 changes: 8 additions & 0 deletions scriptshifter/hooks/asian_tokenizer/__init__.py
@@ -0,0 +1,8 @@
from esupar import load


def s2r_tokenize(ctx, model):
nlp = load(model)
token_data = nlp(ctx.src)

ctx._src = " ".join(token_data.values[1])
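The new hook's contract: read `ctx.src`, tokenize it with the esupar pipeline, and write the space-joined token forms back to `ctx._src`. A minimal usage sketch with a hypothetical stand-in for the hook context (assuming esupar's result exposes CoNLL-style columns via `.values`, with column 1 holding the token forms, as the code above implies):

```python
# Hypothetical stand-in for the hook context; the real ctx comes from
# scriptshifter.trans and exposes src as a property over _src.
from types import SimpleNamespace

from scriptshifter.hooks.asian_tokenizer import s2r_tokenize

ctx = SimpleNamespace(src="สวัสดีชาวโลก", _src=None)
s2r_tokenize(ctx, model="th")
print(ctx._src)  # token forms joined by spaces, e.g. "สวัสดี ชาว โลก"
```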
17 changes: 17 additions & 0 deletions scriptshifter/hooks/chinese/__init__.py
@@ -127,3 +127,20 @@ def parse_numerals_pre_assembly(ctx):
# Skip main transliterate function joining.

return normalize_spacing_post_assembly(ctx)


def person_name_pre_assembly(ctx):
"""
Parse a personal name from a specific MARC field.
"""
if not ctx.options.get("marc_field") in ("100", "600", "700", "800"):
return

ctx.dest_ls[0] = ctx.dest_ls[0].capitalize().strip() + ", "
ctx.dest_ls[1] = ctx.dest_ls[1].capitalize()
if len(ctx.dest_ls) > 2:
ctx.dest_ls[1] = ctx.dest_ls[1].strip()
if ctx.dest_ls[2][0] in "aeiou":
ctx.dest_ls[1] += "'"
ctx.dest_ls[1] += ctx.dest_ls[2]
del(ctx.dest_ls[2])
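A worked example of the name hook, using a hypothetical stand-in context (assuming `ctx.dest_ls` holds one romanized syllable per element and `ctx.options` carries the MARC field):

```python
from types import SimpleNamespace

from scriptshifter.hooks.chinese import person_name_pre_assembly

# Surname plus two given-name syllables, as left by the main mapping step.
ctx = SimpleNamespace(options={"marc_field": "100"},
                      dest_ls=["zhang ", "xue ", "an "])
person_name_pre_assembly(ctx)
print("".join(ctx.dest_ls))  # "Zhang, Xue'an " — the apostrophe marks the
                             # boundary before a vowel-initial syllable
```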
2 changes: 2 additions & 0 deletions scriptshifter/tables/data/chinese.yml
@@ -31,6 +31,8 @@ script_to_roman:
pre_assembly:
-
- chinese.parse_numerals_pre_assembly
-
- chinese.person_name_pre_assembly

map:
"〇": "ling#0 "
3 changes: 3 additions & 0 deletions scriptshifter/tables/data/thai.yml
@@ -33,6 +33,9 @@ options:
script_to_roman:
hooks:
post_config:
-
- asian_tokenizer.s2r_tokenize
- model: "KoichiYasuoka/roberta-base-thai-spm-upos"
-
- aksharamukha.romanizer.s2r_post_config
- src_script: "Thai"
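A note on the two Thai tables: this one loads the tokenizer in `post_config` with a full Hugging Face model id, while `thai_alt.yml` below passes the `th` shorthand in `post_normalize`. Presumably (per the "Use model name shorthand" commit) esupar resolves both to the same Thai pipeline; a sketch of that assumption:

```python
# Assumption: esupar accepts either a language shorthand or a full
# Hugging Face model id; both lines should build a Thai pipeline.
from esupar import load

nlp_full = load("KoichiYasuoka/roberta-base-thai-spm-upos")
nlp_short = load("th")  # shorthand used in thai_alt.yml below
```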
5 changes: 5 additions & 0 deletions scriptshifter/tables/data/thai_alt.yml
@@ -4,6 +4,11 @@ general:
case_sensitive: false

script_to_roman:
hooks:
post_normalize:
-
- asian_tokenizer.s2r_tokenize
- model: "th"
map:
# COMMON SPECIAL CHARACTERS

8 changes: 5 additions & 3 deletions scriptshifter/tables/data/uighur_arabic.yml
@@ -1,5 +1,7 @@
---
general:
name: Uighur (Arabic)
case_sensitive: false

roman_to_script:
map:
@@ -118,7 +120,7 @@ roman_to_script:
"%zh": "\uFB8A"
"zh": "\uFB8B"
"%zh%": "\uFB8A"

script_to_roman:
map:
"\u0626\u0627": "a"
@@ -157,9 +159,9 @@ script_to_roman:
"\uFEEB": "h"
"\uFEEC": "h"
"\u0640\u0629": "h"
"\uFEEA": "h"
"%\uFEEA": "h"
"\u0629": "h"
"\u0647": "h"
"%\u0647%": "h"
"\uFE8C": "i"
"\uFBE8": "i"
"\uFBE9": "i"
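The Uighur edits replace two bare code points with `%`-anchored keys; `%` in these tables marks a word boundary (compare the `%zh`/`%zh%` keys above, and the `_is_bow` boundary handling in trans.py below), so these rules now apply only at word edges. For orientation, a standard-library check naming the code points involved:

```python
# Quick reference for the code points touched above (standard library only).
import unicodedata

for cp in ("\uFEEA", "\u0647", "\u0629"):
    print(f"U+{ord(cp):04X} {unicodedata.name(cp)}")
# U+FEEA ARABIC LETTER HEH FINAL FORM
# U+0647 ARABIC LETTER HEH
# U+0629 ARABIC LETTER TEH MARBUTA
```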
15 changes: 10 additions & 5 deletions scriptshifter/trans.py
@@ -120,11 +120,12 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
if _run_hook("post_config", ctx) == BREAK:
return getattr(ctx, "dest", ""), ctx.warnings

_normalize_src(ctx, get_lang_normalize(ctx.conn, ctx.lang_id))

if _run_hook("post_normalize", ctx) == BREAK:
# _normalize_src returns the results of the post_normalize hook.
if _normalize_src(
ctx, get_lang_normalize(ctx.conn, ctx.lang_id)) == BREAK:
return getattr(ctx, "dest", ""), ctx.warnings

logger.debug(f"Normalized source: {ctx.src}")
lang_map = list(get_lang_map(ctx.conn, ctx.lang_id, ctx.t_dir))

# Loop through source characters. The increment of each loop depends on
@@ -151,7 +152,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
# token or exit the scanning loop altogether.
hret = _run_hook("begin_input_token", ctx)
if hret == BREAK:
logger.debug("Breaking text scanning from hook signal.")
logger.debug("Breaking text scanning from hook signal.")
break
if hret == CONT:
logger.debug("Skipping scanning iteration from hook signal.")
@@ -315,10 +316,14 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
def _normalize_src(ctx, norm_rules):
"""
Normalize source text according to rules.

NOTE: this manipulates the protected source attribute so it may not
correspond to the originally provided source.
"""
for nk, nv in norm_rules.items():
ctx._src = ctx.src.replace(nk, nv)
logger.debug(f"Normalized source: {ctx.src}")

return _run_hook("post_normalize", ctx)


def _is_bow(cur, ctx, word_boundary):