Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Chinese & Uighur fixes. #140

Merged
merged 24 commits into from
Dec 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
e38dafa
Add personal name handling in Chinese.
scossu Oct 13, 2024
2d43865
Merge pull request #139 from lcnetdev/chinese_names
scossu Oct 13, 2024
7405119
Fix variable name in Chinese hook.
scossu Oct 13, 2024
4e955ca
Fix one more variable.
scossu Oct 13, 2024
3510651
Add case insensitive flag to Uighur (Arabic).
scossu Oct 19, 2024
408e57a
WIP ignore by regular expression.
scossu Oct 22, 2024
e2f0d2b
Temp fix for some Uighur characters.
scossu Oct 22, 2024
309d965
Separate Thai words.
scossu Oct 22, 2024
237f1f8
Use model name shorthand.
scossu Oct 22, 2024
ac29135
Merge pull request #141 from lcnetdev/thai_word_splitting
scossu Oct 22, 2024
3e35029
Preload Thai LLM.
scossu Oct 22, 2024
a71d02a
Merge pull request #142 from lcnetdev/thai_word_splitting
scossu Oct 22, 2024
67a359f
Move preloading.
scossu Oct 22, 2024
2f54bf9
Merge pull request #143 from lcnetdev/thai_word_splitting
scossu Oct 22, 2024
e730c9e
Make mail from and to env vars conditional.
scossu Oct 22, 2024
c22fc69
Merge branch 'thai_word_splitting' into test
scossu Oct 22, 2024
22cf1ea
Merge branch 'test' of gh:lcnetdev/scriptshifter into re
scossu Nov 1, 2024
6c5cab4
WIP regexp and testing framework.
scossu Nov 16, 2024
90b9f4c
Do unit test properly.
scossu Nov 17, 2024
7e0722d
Pass minimum test set.
scossu Nov 18, 2024
efb27b8
WIP Add tests for regex ignore patterns.
scossu Nov 25, 2024
91f3ad9
Complete basic tests for regex ignore.
scossu Dec 3, 2024
fbfdec4
Remove Aksharamukha's Thai from menu; rename thai_alt to thai.
scossu Dec 3, 2024
9b98c2e
Merge branch 're' into test
scossu Dec 3, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ COPY tests ./tests/
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

ENV HF_DATASETS_CACHE /data/hf/datasets
RUN ./sscli admin init-db

RUN chmod +x ./entrypoint.sh
#RUN chown -R www:www ${WORKROOT} .

Expand Down
2 changes: 1 addition & 1 deletion doc/rest_api.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ MIME type: `application/json`

Content: JSON object with the following keys:

- `lang`: Language code as given by the `/languages` endpoint.
- `lang`: Language code as given by the `/languages` endpoint.
- `text`: Input text to be transliterated.
- `capitalize`: One of `first` (capitalize the first letter of the input),
`all` (capitalize all words separated by spaces), or null (default: apply no
Expand Down
5 changes: 3 additions & 2 deletions entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@ else
export FLASK_ENV="production"
fi

# Preload Thai model.
python -c 'from esupar import load; load("th")'

host=${TXL_WEBAPP_HOST:-"0.0.0.0"}
port=${TXL_WEBAPP_PORT:-"8000"}

./sscli admin init-db

if [ "${FLASK_ENV}" == "development" ]; then
exec flask run -h $host -p $port
else
Expand Down
2 changes: 2 additions & 0 deletions example.env
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@ FLASK_DEBUG=true
TXL_DICTA_EP="changeme"
TXL_FLASK_SECRET="changeme"
TXL_LOGLEVEL="INFO"
TXL_EMAIL_FROM="[email protected]"
TXL_EMAIL_TO="[email protected]"
LOSHN_KOYDESH_O2P_SRC="${PWD}/scriptshifter/hooks/yiddish_/loshn_koydesh_o2p_override.tsv"
144 changes: 144 additions & 0 deletions legacy/processNumbers.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
/**
 * Resolve runs of "numeric" tokens in a pinyin string to either their
 * romanized text or their Arabic-numeral form.
 *
 * A numeric token has the shape `letters#digits` (the digits may be empty),
 * e.g. `er#2` or `yue#`. For every run of consecutive numeric tokens two
 * candidate strings are built:
 *  - the text version: the letters before each `#`;
 *  - the numeral version: the digits after each `#`, or the letters when
 *    there are no digits.
 * The numeral version is chosen for ordinals ("di 3"), four spaced digits in
 * a row, and dates ("N nian N yue", "N yue N ri"), or when the subfield
 * forces it; otherwise the text version is used.
 *
 * @param pinyinString Input string whose numeric tokens carry a `#` marker.
 * @param tag Field tag; "245" and "830" are treated specially.
 * @param code Subfield code; "n" combined with the tags above forces numerals.
 * @returns The input with every numeric run replaced by the chosen version
 *          and all `#` markers removed from those runs.
 */
private processNumbers(pinyinString: string, tag: string, code: string): string {
    let outputString = "";

    // In these subfields numbers are always treated as numbers.
    const isNumberSubfield = (tag === "245" || tag === "830") && code === "n";
    // Once this is switched on (here, or by a pattern match below) it stays on
    // for the remainder of the string.
    let useNumVersion = isNumberSubfield;

    /*
     * The input string is split with any whitespace or punctuation character
     * (except #) as the delimiter. The capture group keeps the delimiters in
     * the result, so even indices hold the real tokens and odd indices hold
     * the delimiters. The array length is therefore always odd and the last
     * element is always a token (possibly an empty string).
     */
    const tokens: string[] = pinyinString.split(new RegExp("([^\\P{P}#]|\\s)", "u"));
    const numToken_re = /^([A-Za-z]+)#([0-9]*)$/;
    const n = tokens.length;

    for (let i = 0; i < n; i++) {
        const toki = tokens[i];
        if (toki.match(numToken_re)) {
            /*
             * A numeric token starts a run. The inner loop consumes it and all
             * consecutive numeric tokens (with the delimiters between them),
             * building both candidate versions. When a non-numeric token or the
             * end of the string is reached, the run is evaluated and appended
             * to the output; the outer loop then resumes where the run ended.
             */
            let textVersion = "";
            let numVersion = "";
            for (let j = i; j < n; j++) {
                const tokj = tokens[j];

                /* a token without # (or the end of string) is reached */
                if ((j % 2 === 0 && !tokj.match(numToken_re)) || j === n - 1) {
                    const lastMatch = tokj.match(numToken_re);
                    if (lastMatch) {
                        // Last token of the string and it is numeric: add it to both versions.
                        textVersion += lastMatch[1];
                        numVersion += lastMatch[2] === "" ? lastMatch[1] : lastMatch[2];
                    } else if (j === n - 1) {
                        // Last token of the string and it is non-numeric: just tack it on.
                        textVersion += tokj;
                        numVersion += tokj;
                    } else if (textVersion.length > 0 && numVersion.length > 0) {
                        // Not at the end yet: drop the delimiter appended last
                        // (the outer loop will pick it up again).
                        textVersion = textVersion.substring(0, textVersion.length - 1);
                        numVersion = numVersion.substring(0, numVersion.length - 1);
                    }

                    // Evaluate the run built so far: use the numeral version
                    // for ordinals and date strings.
                    if (numVersion.match(/^di [0-9]/i) ||
                        numVersion.match(/[0-9] [0-9] [0-9] [0-9]/) ||
                        numVersion.match(/[0-9]+ nian [0-9]+ yue/i) ||
                        // Fixed: this pattern used to begin with a stray `"`,
                        // so month/day dates could never match.
                        numVersion.match(/[0-9]+ yue [0-9]+ ri/i) ||
                        useNumVersion
                    ) {
                        useNumVersion = true;
                        /*
                         * At this point the string may contain literal translations of
                         * Chinese numerals. Convert these to Arabic numerals
                         * (for example "2 10 7" = "27"): a number followed by a power-of-ten
                         * style unit is multiplied, a unit followed by a number is added.
                         */
                        while (numVersion.match(/[0-9] 10+/) || numVersion.match(/[1-9]0+ [1-9]/)) {
                            const multiplyMatch = numVersion.match(/([0-9]+) ([1-9]0+)/);
                            if (multiplyMatch) {
                                const product = Number(multiplyMatch[1]) * Number(multiplyMatch[2]);
                                numVersion = numVersion.replace(/[0-9]+ [1-9]0+/, String(product));
                            } else {
                                const addMatch = numVersion.match(/([1-9]0+) ([0-9]+)/);
                                if (!addMatch) {
                                    break;
                                }
                                const sum = Number(addMatch[1]) + Number(addMatch[2]);
                                numVersion = numVersion.replace(/[1-9]0+ [0-9]+/, String(sum));
                            }
                        }

                        // A few other tweaks: close up groups of four single digits...
                        numVersion = numVersion.replace(/([0-9]) ([0-9]) ([0-9]) ([0-9])/g, "$1$2$3$4");
                        // ...and, in number subfields, close up every digit pair.
                        if (isNumberSubfield) {
                            while (numVersion.match(/[0-9] [0-9]/)) {
                                numVersion = numVersion.replace(/([0-9]) ([0-9])/, "$1$2");
                            }
                        }
                    }

                    outputString += useNumVersion ? numVersion : textVersion;

                    if (j < n - 1) {
                        // End of string not reached: backtrack so the outer loop's
                        // increment lands on the delimiter after the last numeric
                        // token (i.e. two tokens ago).
                        i = j - 2;
                    } else {
                        // We are at the end of the string, so we are done.
                        i = j;
                    }
                    break;
                }

                // Still inside the run and not at the end of the string.
                if (j % 2 === 0) {
                    // A numeric token (guaranteed to match, otherwise the branch above ran).
                    const runMatch = tokj.match(numToken_re);
                    if (runMatch) {
                        textVersion += runMatch[1];
                        numVersion += runMatch[2] === "" ? runMatch[1] : runMatch[2];
                    }
                } else {
                    // A delimiter, just tack it on.
                    textVersion += tokj;
                    numVersion += tokj;
                }
            }
        } else {
            // The outer loop has encountered a non-numeric token or delimiter, just tack it on.
            outputString += toki;
        }
    }
    return outputString;
}
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Core application dependencies.
aksharamukha>=2.2,<3
esupar>=1.7.5
flask>=2.3,<3
flask-cors>=4.0,<5
python-dotenv>=1.0,<2
Expand Down
8 changes: 5 additions & 3 deletions scriptshifter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
This DB stores all the runtime transliteration data.
"""
DB_PATH = environ.get(
"DB_PATH", path.join(APP_ROOT, "data", "scriptshifter.db"))
"TXL_DB_PATH", path.join(APP_ROOT, "data", "scriptshifter.db"))

"""
SMTP server for sending email. For a dummy server that just echoes the
Expand Down Expand Up @@ -50,8 +50,10 @@
if not env:
logger.warn("No .env file found. Assuming env was passed externally.")

EMAIL_FROM = environ["TXL_EMAIL_FROM"]
EMAIL_TO = environ["TXL_EMAIL_TO"]
if SMTP_HOST or FEEDBACK_PATH:
EMAIL_FROM = environ["TXL_EMAIL_FROM"]
EMAIL_TO = environ["TXL_EMAIL_TO"]

try:
SMTP_PORT = int(environ.get("TXL_SMTP_PORT", "1025"))
except ValueError:
Expand Down
8 changes: 8 additions & 0 deletions scriptshifter/hooks/asian_tokenizer/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from esupar import load


def s2r_tokenize(ctx, model):
    """
    Rewrite the context's source text as space-separated tokens.

    The esupar model named by `model` is loaded and applied to `ctx.src`;
    the result replaces `ctx._src`.

    Args:
        ctx: transliteration context; `src` is read and `_src` is written.
        model: model identifier handed to `esupar.load`.
    """
    tokenizer = load(model)
    parsed = tokenizer(ctx.src)

    # The second entry of `values` is what gets joined — presumably the token
    # surface forms; confirm against the esupar output layout.
    words = parsed.values[1]
    ctx._src = " ".join(words)
17 changes: 17 additions & 0 deletions scriptshifter/hooks/chinese/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,20 @@ def parse_numerals_pre_assembly(ctx):
# Skip main transliterate function joining.

return normalize_spacing_post_assembly(ctx)


def person_name_pre_assembly(ctx):
    """
    Parse a personal name from a specific MARC field.

    Only runs when the `marc_field` option is one of 100, 600, 700 or 800.
    The first token of `ctx.dest_ls` is capitalized and followed by ", ";
    the second token is capitalized; a third token, if present, is merged
    into the second (with an apostrophe in between when the third token
    starts with a vowel) and removed from the list.

    `ctx.dest_ls` is modified in place. Nothing is returned.
    """
    if ctx.options.get("marc_field") not in ("100", "600", "700", "800"):
        return

    tokens = ctx.dest_ls
    if not tokens:
        # Nothing to format.
        return

    if len(tokens) == 1:
        # Single-token name: there is no second part to separate with a
        # comma, so only normalize the one token instead of raising.
        tokens[0] = tokens[0].capitalize().strip()
        return

    tokens[0] = tokens[0].capitalize().strip() + ", "
    tokens[1] = tokens[1].capitalize()
    if len(tokens) > 2:
        tokens[1] = tokens[1].strip()
        # startswith() with a tuple is safe on an empty third token, where
        # indexing the first character would raise.
        if tokens[2].startswith(("a", "e", "i", "o", "u")):
            tokens[1] += "'"
        tokens[1] += tokens[2]
        del tokens[2]
Loading
Loading