lcnetdev · scossu · Dec 9, 2024 · Oct 13, 2024 · Oct 13, 2024 · Oct 13, 2024
diff --git a/Dockerfile b/Dockerfile
@@ -9,6 +9,9 @@ COPY tests ./tests/
 COPY requirements.txt ./
 RUN pip install --no-cache-dir -r requirements.txt
 
+# Point the Hugging Face datasets cache at /data/hf/datasets.
+# Uses the key=value form of ENV; the whitespace-separated form is legacy.
+ENV HF_DATASETS_CACHE=/data/hf/datasets
+# Run the application's init-db admin command while building the image
+# (the same command is removed from entrypoint.sh in this change set).
+RUN ./sscli admin init-db
+
 RUN chmod +x ./entrypoint.sh
 #RUN chown -R www:www ${WORKROOT} .
 

diff --git a/doc/rest_api.md b/doc/rest_api.md
@@ -73,7 +73,7 @@ MIME type: `application/json`
 
 Content: JSON object with the following keys:
 
-- `lang`: Language code as given by the `/languages` endpoint. 
+- `lang`: Language code as given by the `/languages` endpoint.
 - `text`: Input text to be transliterated.
 - `capitalize`: One of `first` (capitalize the first letter of the input),
   `all` (capitalize all words separated by spaces), or null (default: apply no

diff --git a/entrypoint.sh b/entrypoint.sh
@@ -9,11 +9,12 @@ else
     export FLASK_ENV="production"
 fi
 
+# Preload Thai model.
+python -c 'from esupar import load; load("th")'
+
 host=${TXL_WEBAPP_HOST:-"0.0.0.0"}
 port=${TXL_WEBAPP_PORT:-"8000"}
 
-./sscli admin init-db
-
 if [ "${FLASK_ENV}" == "development" ]; then
     exec flask run -h $host -p $port
 else

diff --git a/example.env b/example.env
@@ -2,4 +2,6 @@ FLASK_DEBUG=true
 TXL_DICTA_EP="changeme"
 TXL_FLASK_SECRET="changeme"
 TXL_LOGLEVEL="INFO"
+TXL_EMAIL_FROM="[email protected]"
+TXL_EMAIL_TO="[email protected]"
 LOSHN_KOYDESH_O2P_SRC="${PWD}/scriptshifter/hooks/yiddish_/loshn_koydesh_o2p_override.tsv"
diff --git a/legacy/processNumbers.ts b/legacy/processNumbers.ts
@@ -0,0 +1,144 @@
+/**
+ * Resolve `#`-marked numeric syllables in a pinyin string to either their
+ * pinyin text or Arabic numerals.
+ *
+ * A numeric token has the form `letters#digits` (the digits may be empty),
+ * for example `san#3` or `yue#`. Each run of consecutive numeric tokens is
+ * accumulated in two parallel forms: a text version (the letters only) and a
+ * numeric version (the digits, or the letters when no digits follow `#`).
+ * The numeric version is emitted when the run looks like an ordinal
+ * (`di N`), four separate digits, an `N nian N yue` or `N yue N ri` date, or
+ * when `tag`/`code` select subfield `n` of field 245 or 830. Otherwise the
+ * text version is emitted.
+ *
+ * @param pinyinString - Input text in which numeric syllables carry a `#` suffix.
+ * @param tag - Field tag (presumably a MARC tag); only "245" and "830" are special-cased.
+ * @param code - Subfield code; only "n" is special-cased.
+ * @returns The input with every `#` token replaced by its text or numeric form.
+ */
+private processNumbers(pinyinString: string, tag: string, code: string): string {
+    let outputString = "";
+    // In these specific subfields numbers are always treated as numbers.
+    const isNumberingSubfield = (tag === "245" || tag === "830") && code === "n";
+    // NOTE(review): once this becomes true it stays true for every later
+    // numeric run in the same string -- confirm that this is intended.
+    let useNumVersion = isNumberingSubfield;
+
+    /*
+     * The input string is split, with any space or punctuation character (except for #) as the delimiter.
+     * The delimiters are captured and included in the array, so only the even-numbered
+     * elements are true tokens; odd-numbered elements are delimiters.
+     */
+    const tokens: string[] = pinyinString.split(new RegExp("([^\\P{P}#]|\\s)", "u"));
+    const numToken_re = new RegExp("^([A-Za-z]+)#([0-9]*)$");
+    const n = tokens.length;
+    for (let i = 0; i < n; i++) {
+        const toki = tokens[i];
+        if (toki.match(numToken_re)) {
+            /*
+             * When a numerical token (containing #) is reached, the inner loop consumes it and all
+             * consecutive numerical tokens found after it.  When a non-numerical token (or the end of
+             * the string) is encountered, the accumulated run is evaluated to decide which version goes
+             * to the output.  The outer loop then continues where the inner loop left off.
+             */
+            let textVersion = "";
+            let numVersion = "";
+            for (let j = i; j < n; j++) {
+                const tokj = tokens[j];
+                const m = tokj.match(numToken_re);
+                /* a token without # (or the end of string) is reached */
+                if ((j % 2 === 0 && !m) || j === n - 1) {
+                    if (m) {
+                        // Last token of the string and it is numeric: add it to both versions.
+                        textVersion += m[1];
+                        numVersion += m[2] === "" ? m[1] : m[2];
+                    } else if (j === n - 1) {
+                        // Last token is non-numerical: just tack it on.
+                        textVersion += tokj;
+                        numVersion += tokj;
+                    } else if (textVersion.length > 0 && numVersion.length > 0) {
+                        // Not at the end yet and the token is non-numerical: remove the last
+                        // delimiter that was appended (the outer loop will pick it up again).
+                        textVersion = textVersion.substring(0, textVersion.length - 1);
+                        numVersion = numVersion.substring(0, numVersion.length - 1);
+                    }
+                    // Evaluate the run built so far; use the numeric version for ordinals and dates.
+                    // The "yue ... ri" pattern previously began with a stray double quote, so
+                    // month/day dates were never recognised.
+                    if (numVersion.match(/^di [0-9]/i) ||
+                        numVersion.match(/[0-9] [0-9] [0-9] [0-9]/) ||
+                        numVersion.match(/[0-9]+ nian [0-9]+ yue/i) ||
+                        numVersion.match(/[0-9]+ yue [0-9]+ ri/i) ||
+                        useNumVersion
+                       ) {
+                        useNumVersion = true;
+                        /*
+                         * At this point, the string may contain literal translations of Chinese numerals.
+                         * Convert these to Arabic numerals (for example "2 10 7" = "27").
+                         */
+                        while (numVersion.match(/[0-9] 10+/) || numVersion.match(/[1-9]0+ [1-9]/)) {
+                            const factors = numVersion.match(/([0-9]+) ([1-9]0+)/);
+                            if (factors) {
+                                // "2 10" -> 20: multiply a number by the power of ten that follows it.
+                                const product = Number(factors[1]) * Number(factors[2]);
+                                numVersion = numVersion.replace(/[0-9]+ [1-9]0+/, String(product));
+                            } else {
+                                const addends = numVersion.match(/([1-9]0+) ([0-9]+)/);
+                                if (!addends) {
+                                    break;
+                                }
+                                // "20 7" -> 27: add the number that follows a round number.
+                                const sum = Number(addends[1]) + Number(addends[2]);
+                                numVersion = numVersion.replace(/[1-9]0+ [0-9]+/, String(sum));
+                            }
+                        }
+
+                        // A few other tweaks: join four separate digits, and in the numbering
+                        // subfield join every pair of space-separated digits.
+                        numVersion = numVersion.replace(/([0-9]) ([0-9]) ([0-9]) ([0-9])/g, "$1$2$3$4");
+                        if (isNumberingSubfield) {
+                            while (numVersion.match(/[0-9] [0-9]/)) {
+                                numVersion = numVersion.replace(/([0-9]) ([0-9])/, "$1$2");
+                            }
+                        }
+                    }
+                    outputString += useNumVersion ? numVersion : textVersion;
+                    if (j < n - 1) {
+                        // Backtrack to the delimiter after the last numerical token
+                        // (i.e. two tokens ago); the outer loop's i++ lands on it.
+                        i = j - 2;
+                    } else {
+                        // End of the string: we are done.
+                        i = j;
+                    }
+                    break;
+                }
+                // Not yet at the end of the string and no non-numerical token reached yet.
+                if (j % 2 === 0) {
+                    // An even-indexed token that reaches this point always matched above;
+                    // the guard only satisfies strict null checking.
+                    if (m) {
+                        textVersion += m[1];
+                        numVersion += m[2] === "" ? m[1] : m[2];
+                    }
+                } else {
+                    // A delimiter: just tack it on.
+                    textVersion += tokj;
+                    numVersion += tokj;
+                }
+            }
+        } else {
+            // The outer loop has encountered a non-numeric token or delimiter: just tack it on.
+            outputString += toki;
+        }
+    }
+    return outputString;
+}
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,6 @@
 # Core application dependencies.
 aksharamukha>=2.2,<3
+esupar>=1.7.5
 flask>=2.3,<3
 flask-cors>=4.0,<5
 python-dotenv>=1.0,<2

diff --git a/scriptshifter/__init__.py b/scriptshifter/__init__.py
@@ -15,7 +15,7 @@
 This DB stores all the runtime transliteration data.
 """
 DB_PATH = environ.get(
-        "DB_PATH", path.join(APP_ROOT, "data", "scriptshifter.db"))
+        "TXL_DB_PATH", path.join(APP_ROOT, "data", "scriptshifter.db"))
 
 """
 SMTP server for sending email. For a dummy server that just echoes the
@@ -50,8 +50,10 @@
 if not env:
     logger.warn("No .env file found. Assuming env was passed externally.")
 
-EMAIL_FROM = environ["TXL_EMAIL_FROM"]
-EMAIL_TO = environ["TXL_EMAIL_TO"]
+if SMTP_HOST or FEEDBACK_PATH:
+    EMAIL_FROM = environ["TXL_EMAIL_FROM"]
+    EMAIL_TO = environ["TXL_EMAIL_TO"]
+
 try:
     SMTP_PORT = int(environ.get("TXL_SMTP_PORT", "1025"))
 except ValueError:

diff --git a/scriptshifter/hooks/asian_tokenizer/__init__.py b/scriptshifter/hooks/asian_tokenizer/__init__.py
@@ -0,0 +1,8 @@
+from esupar import load
+
+
+def s2r_tokenize(ctx, model):
+    nlp = load(model)
+    token_data = nlp(ctx.src)
+
+    ctx._src = " ".join(token_data.values[1])
diff --git a/scriptshifter/hooks/chinese/__init__.py b/scriptshifter/hooks/chinese/__init__.py
@@ -127,3 +127,20 @@ def parse_numerals_pre_assembly(ctx):
     # Skip main transliterate function joining.
 
     return normalize_spacing_post_assembly(ctx)
+
+
+def person_name_pre_assembly(ctx):
+    """
+    Parse a personal name from a specific MARC field.
+    """
+    if not ctx.options.get("marc_field") in ("100", "600", "700", "800"):
+        return
+
+    ctx.dest_ls[0] = ctx.dest_ls[0].capitalize().strip() + ", "
+    ctx.dest_ls[1] = ctx.dest_ls[1].capitalize()
+    if len(ctx.dest_ls) > 2:
+        ctx.dest_ls[1] = ctx.dest_ls[1].strip()
+        if ctx.dest_ls[2][0] in "aeiou":
+            ctx.dest_ls[1] += "'"
+        ctx.dest_ls[1] += ctx.dest_ls[2]
+        del(ctx.dest_ls[2])