Merge branch 'main' of gh:lcnetdev/scriptshifter

lcnetdev · May 19, 2024 · 26c3513 · 26c3513
2 parents 5eccdce + 99dcaac
commit 26c3513
Show file tree

Hide file tree

Showing 38 changed files with 46,529 additions and 45,745 deletions.
diff --git a/.github/workflows/push-docker-image.yml → .github/workflows/push-app-image.yml b/.github/workflows/push-docker-image.yml → .github/workflows/push-app-image.yml
@@ -1,8 +1,15 @@
-name: Push image to Docker Hub.
+name: Push app image
 on:
+  # Runs directly on patch version tags (v*.*.N with N >= 1). For v*.*.0
+  # tags it runs after the "Push base image" workflow has completed.
   push:
     tags:
-      - "v*.*.*"
+      - "v*.*.[1-9]*"
+  workflow_run:
+    workflows: 
+      - "Push base image"
+    types:
+      - "completed"
 
 env:
   DOCKER_USER: lcnetdev
@@ -13,13 +20,15 @@ jobs:
   push-image-to-docker-hub:
     runs-on: ubuntu-latest
+    # workflow_run fires on any completion of "Push base image", including
+    # failures; only continue when the base image build succeeded.
+    if: >-
+      github.event_name != 'workflow_run' ||
+      github.event.workflow_run.conclusion == 'success'
     steps:
-      - uses: actions/checkout@v3
+      - name: checkout repo
+        uses: actions/checkout@v4
         with:
           submodules: recursive
+          # Under workflow_run GITHUB_SHA is the tip of the default branch;
+          # check out the commit of the run that triggered this workflow.
+          ref: ${{ github.event.workflow_run.head_sha || github.sha }}
 
       - name: Build the Docker image
+        env:
+          # Under workflow_run github.ref_name is the default branch, not
+          # the release tag; take the ref name of the triggering run.
+          IMAGE_TAG: ${{ github.event.workflow_run.head_branch || github.ref_name }}
         run: >
-          docker build . --tag $DOCKER_USER/$REPO_NAME:${{ github.ref_name }}
+          docker build -f Dockerfile .
+          --tag "$DOCKER_USER/$REPO_NAME:$IMAGE_TAG"
           --tag $DOCKER_USER/$REPO_NAME:latest
 
       - name: Login to Docker Hub

diff --git a/.github/workflows/push-base-image.yml b/.github/workflows/push-base-image.yml
@@ -0,0 +1,46 @@
+# Builds the image defined in scriptshifter_base.Dockerfile and pushes it to
+# Docker Hub. The workflow name is referenced by the workflow_run trigger of
+# the app image workflow, so the two must be kept in sync.
+name: Push base image
+on:
+  push:
+    tags:
+      # Only tags whose last component is 0 trigger a base image build.
+      - "v*.*.0"
+
+env:
+  DOCKER_USER: lcnetdev
+  # NOTE(review): no step below reads DOCKER_PASSWORD; the login step uses
+  # secrets.DOCKER_HUB directly.
+  DOCKER_PASSWORD: ${{secrets.DOCKER_HUB}}
+  REPO_NAME: scriptshifter-base
+
+jobs:
+  push-image-to-docker-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - name: checkout repo
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      # The next two steps check out additional repositories into paths
+      # under ext/yiddish. Presumably the recursive checkout above does not
+      # populate them - TODO confirm.
+      - name: checkout yiddish submodules (1/2)
+        uses: actions/checkout@v4
+        with:
+          repository: ibleaman/loshn-koydesh-pronunciation
+          path: ext/yiddish/yiddish/submodules/loshn-koydesh-pronunciation
+
+      - name: checkout yiddish submodules (2/2)
+        uses: actions/checkout@v4
+        with:
+          repository: ibleaman/hasidify_lexicon
+          path: ext/yiddish/yiddish/submodules/hasidify_lexicon
+
+      # The image gets two tags: the pushed git tag name and "latest".
+      - name: Build the Docker image
+        run: >
+          docker build -f scriptshifter_base.Dockerfile .
+          --tag $DOCKER_USER/$REPO_NAME:${{ github.ref_name }}
+          --tag $DOCKER_USER/$REPO_NAME:latest
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: lcnetdev
+          password: ${{ secrets.DOCKER_HUB }}
+
+      # --all-tags pushes both tags created by the build step.
+      - name: Push to Docker Hub
+        run: docker push $DOCKER_USER/$REPO_NAME --all-tags
diff --git a/.github/workflows/push-test-image.yml b/.github/workflows/push-test-image.yml
@@ -1,8 +1,8 @@
-name: Push test image to Docker Hub.
+name: Push test image
 on:
   push:
-    branch:
-      - "main"
+    branches:
+      - "test"
 
 env:
   DOCKER_USER: lcnetdev
@@ -13,12 +13,16 @@ jobs:
   push-image-to-docker-hub:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - name: checkout repo
+        uses: actions/checkout@v4
         with:
           submodules: recursive
 
       - name: Build the Docker image
-        run: docker build . --tag $DOCKER_USER/$REPO_NAME:test
+        run: >
+          docker build -f Dockerfile .
+          --tag $DOCKER_USER/$REPO_NAME:${{ github.ref_name }}
+          --tag $DOCKER_USER/$REPO_NAME:test
 
       - name: Login to Docker Hub
         uses: docker/login-action@v3

diff --git a/.gitmodules b/.gitmodules
@@ -1,3 +1,7 @@
 [submodule "ext/arabic_rom"]
 	path = ext/arabic_rom
 	url = https://github.com/fadhleryani/Arabic_ALA-LC_Romanization.git
+[submodule "ext/yiddish"]
+	path = ext/yiddish
+	url = https://github.com/scossu/yiddish.git
+	branch = loc
diff --git a/Dockerfile b/Dockerfile
@@ -1,29 +1,15 @@
-FROM python:3.10-slim-bullseye
-
-RUN apt update
-RUN apt install -y build-essential tzdata gfortran libopenblas-dev libboost-all-dev
-
-ENV TZ=America/New_York
-ENV _workroot "/usr/local/scriptshifter/src"
-
-WORKDIR ${_workroot}
-COPY requirements.txt ./
-RUN pip install --no-cache-dir -r requirements.txt
-
-# Remove development packages.
-RUN apt remove -y build-essential
-RUN apt autoremove -y
-
-RUN addgroup --system www
-RUN adduser --system www
-RUN gpasswd -a www www
+FROM lcnetdev/scriptshifter-base:latest
+# Application root directory. An ARG default must be written NAME=value;
+# separated by a space, the path is not assigned to WORKROOT.
+ARG WORKROOT="/usr/local/scriptshifter/src"
 
+# Copy core application files.
+WORKDIR ${WORKROOT}
 COPY entrypoint.sh uwsgi.ini wsgi.py ./
-COPY ext ./ext/
 COPY scriptshifter ./scriptshifter/
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
 
 RUN chmod +x ./entrypoint.sh
-RUN chown -R www:www ${_workroot} .
+#RUN chown -R www:www ${WORKROOT} .
 
 EXPOSE 8000
 

diff --git a/deps.txt b/deps.txt
@@ -0,0 +1,7 @@
+# External dependencies.
+aksharamukha>=2.2,<3
+camel-tools>=1.5
+funcy>=1.15,<2
+pymarc>=4.0,<5
+repackage>=0.7.3
+./ext/yiddish
diff --git a/doc/hooks.md b/doc/hooks.md
@@ -333,7 +333,7 @@ and return it before any further default processing is done.
 
 #### Output
 
-`"ret"` or `None`. If `"ret"`, the transliteration function returns `ctx.dest`
+String or `None`. If a string, the transliteration function returns that
 immediately; otherwise it proceeds with standard adjustments of the output
 string before returning.
 

diff --git a/doc/rest_api.md b/doc/rest_api.md
@@ -69,6 +69,10 @@ Transliterate an input string into a given language.
 
 ### POST body
 
+MIME type: `application/json`
+
+Content: JSON object with the following keys:
+
 - `lang`: Language code as given by the `/languages` endpoint. 
 - `text`: Input text to be transliterated.
 - `capitalize`: One of `first` (capitalize the first letter of the input),
@@ -92,3 +96,22 @@ Content: JSON object containing two keys: `output` containing the transliterated
 string; and `warnings` containing a list of warnings. Characters not found in
 the mapping are copied verbatim in the transliterated string (see
 "Configuration files" section for more information).
+
+## `POST /feedback`
+
+Send a feedback form about a transliteration result.
+
+### POST body
+
+MIME type: `application/json`
+
+Content: JSON object with the following keys:
+
+- `lang`: language of the transliteration. Mandatory.
+- `src`: source text. Mandatory.
+- `t_dir`: transliteration direction. If omitted, it defaults to `s2r`.
+- `result`: result of the transliteration. Mandatory.
+- `expected`: expected result. Mandatory.
+- `options`: options passed to the request, if any. Optional.
+- `notes`: user notes. Optional.
+- `contact`: contact email for feedback. Optional.
diff --git a/ext/yiddish b/ext/yiddish
diff --git a/requirements.txt b/requirements.txt
@@ -1,9 +1,5 @@
-aksharamukha>=2.1,<3
-camel-tools>=1.5
+# Core application dependencies.
 flask>=2.3,<3
-funcy>=1.15,<2
-pymarc>=4.0,<5
 python-dotenv>=1.0,<2
 pyyaml>=6.0,<7
-repackage>=0.7.3
 uwsgi>=2.0,<2.1
diff --git a/scriptshifter/hooks/aksharamukha/romanizer.py b/scriptshifter/hooks/aksharamukha/romanizer.py
@@ -15,16 +15,22 @@
 logger = getLogger(__name__)
 
 
-def s2r_post_config(ctx, src_script):
+def s2r_post_config(ctx, src_script, pre=[], post=[]):
     # options = detect_preoptions(ctx.src, src_script)
-    options = [n for n, v in ctx.options.items() if v and n != "capitalize"]
-    ctx.dest = process(src_script, "IAST", ctx.src, pre_options=options)
+    pre_options = pre + [
+            n for n, v in ctx.options.items() if v and n != "capitalize"]
+    ctx.dest = process(
+            src_script, "RomanLoC", ctx.src,
+            pre_options=pre_options, post_options=post)
 
     return BREAK
 
 
-def r2s_post_config(ctx, dest_script):
-    options = [n for n, v in ctx.options.items() if v and n != "capitalize"]
-    ctx.dest = process("IAST", dest_script, ctx.src, post_options=options)
+def r2s_post_config(ctx, dest_script, pre=[], post=[]):
+    post_options = post + [
+            n for n, v in ctx.options.items() if v and n != "capitalize"]
+    ctx.dest = process(
+            "RomanLoC", dest_script, ctx.src,
+            pre_options=pre, post_options=post_options)
 
     return BREAK
diff --git a/scriptshifter/hooks/chinese/__init__.py b/scriptshifter/hooks/chinese/__init__.py
@@ -0,0 +1,131 @@
+__doc__ = """Chinese hooks."""
+
+
+from logging import getLogger
+from re import I, compile, search, sub
+
+from scriptshifter.hooks.general import normalize_spacing_post_assembly
+
+
+logger = getLogger(__name__)
+
+
+def parse_numerals_pre_assembly(ctx):
+    """
+    Parse Chinese numerals in the already romanized result.
+
+    Walks the token list `ctx.dest_ls`, replaces each run of consecutive
+    numeric tokens (tokens containing `#`) with either its pinyin text or
+    its Arabic-numeral form, and writes the assembled string to `ctx.dest`.
+
+    NOTE(review): this was documented as running at post-assembly, but the
+    function name and its use of `ctx.dest_ls` suggest a pre-assembly hook;
+    confirm against the hook configuration.
+
+    :param ctx: Transliteration context; `options` and `dest_ls` are read
+        and `dest` is set.
+
+    :return: The return value of `normalize_spacing_post_assembly(ctx)`.
+    """
+    # Start with the numeric version forced on for specific MARC fields.
+    # The flag is only ever switched on below, never reset, so once a run
+    # of tokens selects the numeric version all later runs use it too.
+    use_num_v = ctx.options.get("marc_field") in ("245n", "830n")
+
+    # tokens = split(r"[\W^#]", ctx.dest)  # Original logic.
+    tk_ct = len(ctx.dest_ls)
+    # Group 1: letters before "#"; group 2: optional digits after "#";
+    # group 3: trailing whitespace.
+    token_ptn = compile(r"^([A-Za-z]+)#([0-9]*)(\s*)$")
+
+    output = ""
+
+    # Use manual loop as i is manipulated inside it.
+    i = 0
+
+    while i < tk_ct:
+        tk_i = ctx.dest_ls[i]
+        if search(token_ptn, tk_i):
+            # When a numerical token (containing #) is reached, the inner loop
+            # consumes it and all consecutive numerical tokens found after it.
+            # Two versions of the string are maintained. text_v is the
+            # original pinyin (minus the # suffixes). In num_v, characters
+            # representing numbers are converted to Arabic numerals. When a
+            # non-numerical token (or end of string) is encountered, the
+            # string of numerical tokens is evaluated to determine which
+            # version should be used in the output string. The outer loop
+            # then continues where the inner loop left off.
+            logger.debug(f"Match number: {tk_i}.")
+            text_v = num_v = ""
+            for j in range(i, tk_ct):
+                tk_j = ctx.dest_ls[j]
+                m = search(token_ptn, tk_j)
+                # if m:
+                #     logger.debug(f"m[1]: {m[1]} - m[2]: {m[2]}")
+                # A token without # (or the end of the token list) is reached.
+                if not m or j == tk_ct - 1:
+                    logger.debug(f"Next token is not numeric: {tk_j}")
+                    # If m is set here, we are on the last token and it is
+                    # numeric. Add text after # (if present) to numerical
+                    # version and captured whitespace after the number.
+                    if m:
+                        text_v += m[1] + m[3]
+                        num_v += m[2] + m[3] if len(m[2]) else m[1] + m[3]
+                        # Append white space.
+                        num_v += " "
+                    elif j == tk_ct - 1:
+                        # If last token is non-numerical, just tack it on.
+                        logger.debug(f"Last token is non-numerical: {tk_j}")
+                        text_v += tk_j
+                        num_v += tk_j
+                    # Evaluate the numerical string that has been constructed
+                    # so far. Use num version for ordinals and date strings.
+                    if (
+                        search("^di [0-9]", num_v, flags=I) or
+                        search("[0-9] [0-9] [0-9] [0-9]", num_v) or
+                        search("[0-9]+ nian [0-9]+ yue", num_v, flags=I) or
+                        search("[0-9]+ yue [0-9]+ ri", num_v, flags=I)
+                    ):
+                        use_num_v = True
+                        # At this point, string may contain literal
+                        # translations of Chinese numerals. Convert these to
+                        # Arabic numerals (for example "2 10 7" = "27").
+                        mult_ptn = compile(r"(\b[0-9]) ([1-9]0+)")
+                        sum_ptn = compile("([1-9]0+) ([0-9]+)")
+                        # Repeat until no multiply/add combination is left;
+                        # each pass rewrites only the first match.
+                        while _m := search("[0-9] 10+|[1-9]0+ [1-9]", num_v):
+                            logger.debug(f"Match number combination: {_m}")
+                            if m := mult_ptn.search(num_v):
+                                logger.debug(f"Multiply: {m[1]}, {m[2]}")
+                                parsed = int(m[1]) * int(m[2])
+                                num_v = mult_ptn.sub(str(parsed), num_v, 1)
+                            elif m := sum_ptn.search(num_v):
+                                logger.debug(f"Add: {m[1]}, {m[2]}")
+                                parsed = int(m[1]) + int(m[2])
+                                num_v = sum_ptn.sub(str(parsed), num_v, 1)
+                            else:
+                                # Neither pattern applies; stop to avoid
+                                # looping forever.
+                                break
+                        # A few other tweaks: join four space-separated
+                        # single digits into one number.
+                        num_v = sub(
+                                "([0-9]) ([0-9]) ([0-9]) ([0-9])",
+                                r"\1\2\3\4", num_v)
+                        if ctx.options.get("marc_field") in ("245", "830"):
+                            # Remove every space between two digits.
+                            # TODO optimize without loop.
+                            while search("[0-9] [0-9]", num_v):
+                                num_v = sub("([0-9]) ([0-9])", r"\1\2", num_v)
+
+                    output += num_v if use_num_v else text_v
+
+                    # If the end of the token list is not reached, set i so
+                    # that the `i += 1` at the bottom of the outer loop
+                    # resumes at token j, the non-numerical token that ended
+                    # this run and has not been added to the output yet.
+                    #
+                    # Else, we are at the end of the string, so we are done!
+                    i = j - 1 if j < tk_ct - 1 else j
+                    break
+
+                # This is run when we are not yet at the end of the string and
+                # have not yet reached a non-numerical token. This is identical
+                # to the code that is run above when the last token is numeric,
+                # except that whitespace after the token is stripped.
+                m = search(token_ptn, tk_j)
+                text_v += m[1] + " "
+                num_v += m[2] if len(m[2]) else m[1]
+                num_v += " "
+
+        else:
+            logger.debug(f"No numeric match: adding {tk_i}.")
+            output += tk_i
+
+        i += 1
+
+    logger.debug(f"Use num version: {use_num_v}")
+    ctx.dest = output
+
+    # Skip main transliterate function joining.
+
+    return normalize_spacing_post_assembly(ctx)