Chinese & Uighur fixes. #140

Merged
merged 24 commits on Dec 9, 2024
Changes from 11 commits
Commits
e38dafa
Add personal name handling in Chinese.
scossu Oct 13, 2024
2d43865
Merge pull request #139 from lcnetdev/chinese_names
scossu Oct 13, 2024
7405119
Fix variable name in Chinese hook.
scossu Oct 13, 2024
4e955ca
Fix one more variable.
scossu Oct 13, 2024
3510651
Add case insensitive flag to Uighur (Arabic).
scossu Oct 19, 2024
408e57a
WIP ignore by regular expression.
scossu Oct 22, 2024
e2f0d2b
Temp fix for some Uighur characters.
scossu Oct 22, 2024
309d965
Separate Thai words.
scossu Oct 22, 2024
237f1f8
Use model name shorthand.
scossu Oct 22, 2024
ac29135
Merge pull request #141 from lcnetdev/thai_word_splitting
scossu Oct 22, 2024
3e35029
Preload Thai LLM.
scossu Oct 22, 2024
a71d02a
Merge pull request #142 from lcnetdev/thai_word_splitting
scossu Oct 22, 2024
67a359f
Move preloading.
scossu Oct 22, 2024
2f54bf9
Merge pull request #143 from lcnetdev/thai_word_splitting
scossu Oct 22, 2024
e730c9e
Make mail from and to env vars conditional.
scossu Oct 22, 2024
c22fc69
Merge branch 'thai_word_splitting' into test
scossu Oct 22, 2024
22cf1ea
Merge branch 'test' of gh:lcnetdev/scriptshifter into re
scossu Nov 1, 2024
6c5cab4
WIP regexp and testing framework.
scossu Nov 16, 2024
90b9f4c
Do unit test properly.
scossu Nov 17, 2024
7e0722d
Pass minimum test set.
scossu Nov 18, 2024
efb27b8
WIP Add tests for regex ignore patterns.
scossu Nov 25, 2024
91f3ad9
Complete basic tests for regex ignore.
scossu Dec 3, 2024
fbfdec4
Remove Aksharamukha's Thai from menu; rename thai_alt to thai.
scossu Dec 3, 2024
9b98c2e
Merge branch 're' into test
scossu Dec 3, 2024
3 changes: 3 additions & 0 deletions entrypoint.sh
@@ -14,6 +14,9 @@ port=${TXL_WEBAPP_PORT:-"8000"}

./sscli admin init-db

# Preload Thai model.
python -c 'from esupar import load; load("th")'

if [ "${FLASK_ENV}" == "development" ]; then
exec flask run -h $host -p $port
else
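The added preload line pays the model download and initialization cost at container start rather than on the first request. A minimal sketch of the same warm-up in plain Python, assuming esupar caches the downloaded model between calls:

```python
# Warm-up sketch (assumption: esupar caches the downloaded model, so the
# load("th") call inside the web app later is fast).
from esupar import load

nlp = load("th")           # downloads and caches the Thai model
print(type(nlp).__name__)  # sanity check that a pipeline object was built
```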
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,5 +1,6 @@
# Core application dependencies.
aksharamukha>=2.2,<3
esupar>=1.7.5
flask>=2.3,<3
flask-cors>=4.0,<5
python-dotenv>=1.0,<2
8 changes: 8 additions & 0 deletions scriptshifter/hooks/asian_tokenizer/__init__.py
@@ -0,0 +1,8 @@
from esupar import load


def s2r_tokenize(ctx, model):
nlp = load(model)
token_data = nlp(ctx.src)

ctx._src = " ".join(token_data.values[1])
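The new hook's contract: read `ctx.src`, tokenize it with the esupar pipeline, and write the space-joined token forms back to `ctx._src`. A minimal usage sketch with a hypothetical stand-in for the hook context (assuming esupar's result exposes CoNLL-style columns via `.values`, with column 1 holding the token forms, as the code above implies):

```python
# Hypothetical stand-in for the hook context; the real ctx comes from
# scriptshifter.trans and exposes src as a property over _src.
from types import SimpleNamespace

from scriptshifter.hooks.asian_tokenizer import s2r_tokenize

ctx = SimpleNamespace(src="สวัสดีชาวโลก", _src=None)
s2r_tokenize(ctx, model="th")
print(ctx._src)  # token forms joined by spaces, e.g. "สวัสดี ชาว โลก"
```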
17 changes: 17 additions & 0 deletions scriptshifter/hooks/chinese/__init__.py
@@ -127,3 +127,20 @@ def parse_numerals_pre_assembly(ctx):
# Skip main transliterate function joining.

return normalize_spacing_post_assembly(ctx)


def person_name_pre_assembly(ctx):
"""
Parse a personal name from a specific MARC field.
"""
if not ctx.options.get("marc_field") in ("100", "600", "700", "800"):
return

ctx.dest_ls[0] = ctx.dest_ls[0].capitalize().strip() + ", "
ctx.dest_ls[1] = ctx.dest_ls[1].capitalize()
if len(ctx.dest_ls) > 2:
ctx.dest_ls[1] = ctx.dest_ls[1].strip()
if ctx.dest_ls[2][0] in "aeiou":
ctx.dest_ls[1] += "'"
ctx.dest_ls[1] += ctx.dest_ls[2]
del(ctx.dest_ls[2])
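A worked example of the name hook, using a hypothetical stand-in context (assuming `ctx.dest_ls` holds one romanized syllable per element and `ctx.options` carries the MARC field):

```python
from types import SimpleNamespace

from scriptshifter.hooks.chinese import person_name_pre_assembly

# Surname plus two given-name syllables, as left by the main mapping step.
ctx = SimpleNamespace(options={"marc_field": "100"},
                      dest_ls=["zhang ", "xue ", "an "])
person_name_pre_assembly(ctx)
print("".join(ctx.dest_ls))  # "Zhang, Xue'an " — the apostrophe marks the
                             # boundary before a vowel-initial syllable
```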
2 changes: 2 additions & 0 deletions scriptshifter/tables/data/chinese.yml
@@ -31,6 +31,8 @@ script_to_roman:
pre_assembly:
-
- chinese.parse_numerals_pre_assembly
-
- chinese.person_name_pre_assembly

map:
"〇": "ling#0 "
3 changes: 3 additions & 0 deletions scriptshifter/tables/data/thai.yml
@@ -33,6 +33,9 @@ options:
script_to_roman:
hooks:
post_config:
-
- asian_tokenizer.s2r_tokenize
- model: "KoichiYasuoka/roberta-base-thai-spm-upos"
-
- aksharamukha.romanizer.s2r_post_config
- src_script: "Thai"
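A note on the two Thai tables: this one loads the tokenizer in `post_config` with a full Hugging Face model id, while `thai_alt.yml` below passes the `th` shorthand in `post_normalize`. Presumably (per the "Use model name shorthand" commit) esupar resolves both to the same Thai pipeline; a sketch of that assumption:

```python
# Assumption: esupar accepts either a language shorthand or a full
# Hugging Face model id; both lines should build a Thai pipeline.
from esupar import load

nlp_full = load("KoichiYasuoka/roberta-base-thai-spm-upos")
nlp_short = load("th")  # shorthand used in thai_alt.yml below
```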
5 changes: 5 additions & 0 deletions scriptshifter/tables/data/thai_alt.yml
@@ -4,6 +4,11 @@ general:
case_sensitive: false

script_to_roman:
hooks:
post_normalize:
-
- asian_tokenizer.s2r_tokenize
- model: "th"
map:
# COMMON SPECIAL CHARACTERS

8 changes: 5 additions & 3 deletions scriptshifter/tables/data/uighur_arabic.yml
@@ -1,5 +1,7 @@
---
general:
name: Uighur (Arabic)
case_sensitive: false

roman_to_script:
map:
@@ -118,7 +120,7 @@ roman_to_script:
"%zh": "\uFB8A"
"zh": "\uFB8B"
"%zh%": "\uFB8A"

script_to_roman:
map:
"\u0626\u0627": "a"
@@ -157,9 +159,9 @@ script_to_roman:
"\uFEEB": "h"
"\uFEEC": "h"
"\u0640\u0629": "h"
"\uFEEA": "h"
"%\uFEEA": "h"
"\u0629": "h"
"\u0647": "h"
"%\u0647%": "h"
"\uFE8C": "i"
"\uFBE8": "i"
"\uFBE9": "i"
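The Uighur edits replace two bare code points with `%`-anchored keys; `%` in these tables marks a word boundary (compare the `%zh`/`%zh%` keys above, and the `_is_bow` boundary handling in trans.py below), so these rules now apply only at word edges. For orientation, a standard-library check naming the code points involved:

```python
# Quick reference for the code points touched above (standard library only).
import unicodedata

for cp in ("\uFEEA", "\u0647", "\u0629"):
    print(f"U+{ord(cp):04X} {unicodedata.name(cp)}")
# U+FEEA ARABIC LETTER HEH FINAL FORM
# U+0647 ARABIC LETTER HEH
# U+0629 ARABIC LETTER TEH MARBUTA
```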
15 changes: 10 additions & 5 deletions scriptshifter/trans.py
@@ -120,11 +120,12 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
if _run_hook("post_config", ctx) == BREAK:
return getattr(ctx, "dest", ""), ctx.warnings

_normalize_src(ctx, get_lang_normalize(ctx.conn, ctx.lang_id))

if _run_hook("post_normalize", ctx) == BREAK:
# _normalize_src returns the results of the post_normalize hook.
if _normalize_src(
ctx, get_lang_normalize(ctx.conn, ctx.lang_id)) == BREAK:
return getattr(ctx, "dest", ""), ctx.warnings

logger.debug(f"Normalized source: {ctx.src}")
lang_map = list(get_lang_map(ctx.conn, ctx.lang_id, ctx.t_dir))

# Loop through source characters. The increment of each loop depends on
@@ -151,7 +152,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
# token or exit the scanning loop altogether.
hret = _run_hook("begin_input_token", ctx)
if hret == BREAK:
logger.debug("Breaking text scanning from hook signal.")
logger.debug("Breaking text scanning from hook signal.")
break
if hret == CONT:
logger.debug("Skipping scanning iteration from hook signal.")
@@ -315,10 +316,14 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
def _normalize_src(ctx, norm_rules):
"""
Normalize source text according to rules.

NOTE: this manipulates the protected source attribute so it may not
correspond to the originally provided source.
"""
for nk, nv in norm_rules.items():
ctx._src = ctx.src.replace(nk, nv)
logger.debug(f"Normalized source: {ctx.src}")

return _run_hook("post_normalize", ctx)


def _is_bow(cur, ctx, word_boundary):