Skip to content

Commit

Permalink
Merge branch 'main' of gh:lcnetdev/scriptshifter
Browse files Browse the repository at this point in the history
  • Loading branch information
scossu committed May 19, 2024
2 parents 5eccdce + 99dcaac commit 26c3513
Show file tree
Hide file tree
Showing 38 changed files with 46,529 additions and 45,745 deletions.
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
name: Push image to Docker Hub.
name: Push app image
on:
# This runs on v*.*.0 tags after the base image has been
# built and pushed, or directly on patch version tags.
push:
tags:
- "v*.*.*"
- "v*.*.[1-9]*"
workflow_run:
workflows:
- "Push base image"
types:
- "completed"

env:
DOCKER_USER: lcnetdev
Expand All @@ -13,13 +20,15 @@ jobs:
push-image-to-docker-hub:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: checkout repo
uses: actions/checkout@v4
with:
submodules: recursive

- name: Build the Docker image
run: >
docker build . --tag $DOCKER_USER/$REPO_NAME:${{ github.ref_name }}
docker build -f Dockerfile .
--tag $DOCKER_USER/$REPO_NAME:${{ github.ref_name }}
--tag $DOCKER_USER/$REPO_NAME:latest
- name: Login to Docker Hub
Expand Down
46 changes: 46 additions & 0 deletions .github/workflows/push-base-image.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Build the base image and publish it to Docker Hub.
# Triggered only by tags whose patch component is 0 (v*.*.0).
# NOTE: the app image workflow's `workflow_run` trigger lists this workflow
# by name ("Push base image"); keep the two in sync if this is renamed.
name: Push base image
on:
  push:
    tags:
      - "v*.*.0"

# The Docker Hub secret is deliberately not exported here: no step reads it
# from the environment, and a workflow-level env entry would expose it to
# every step. It is passed only to the login step below.
env:
  DOCKER_USER: lcnetdev
  REPO_NAME: scriptshifter-base

jobs:
  push-image-to-docker-hub:
    runs-on: ubuntu-latest
    steps:
      - name: checkout repo
        uses: actions/checkout@v4
        with:
          submodules: recursive

      # The two repositories below are checked out explicitly into the
      # submodule paths of the yiddish extension.
      - name: checkout yiddish submodules (1/2)
        uses: actions/checkout@v4
        with:
          repository: ibleaman/loshn-koydesh-pronunciation
          path: ext/yiddish/yiddish/submodules/loshn-koydesh-pronunciation

      - name: checkout yiddish submodules (2/2)
        uses: actions/checkout@v4
        with:
          repository: ibleaman/hasidify_lexicon
          path: ext/yiddish/yiddish/submodules/hasidify_lexicon

      # Folded scalar: the lines below are joined into one shell command.
      # The image is tagged with both the pushed git tag and `latest`.
      - name: Build the Docker image
        run: >-
          docker build -f scriptshifter_base.Dockerfile .
          --tag $DOCKER_USER/$REPO_NAME:${{ github.ref_name }}
          --tag $DOCKER_USER/$REPO_NAME:latest

      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ env.DOCKER_USER }}
          password: ${{ secrets.DOCKER_HUB }}

      - name: Push to Docker Hub
        run: docker push $DOCKER_USER/$REPO_NAME --all-tags
14 changes: 9 additions & 5 deletions .github/workflows/push-test-image.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
name: Push test image to Docker Hub.
name: Push test image
on:
push:
branch:
- "main"
branches:
- "test"

env:
DOCKER_USER: lcnetdev
Expand All @@ -13,12 +13,16 @@ jobs:
push-image-to-docker-hub:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: checkout repo
uses: actions/checkout@v4
with:
submodules: recursive

- name: Build the Docker image
run: docker build . --tag $DOCKER_USER/$REPO_NAME:test
run: >
docker build -f Dockerfile .
--tag $DOCKER_USER/$REPO_NAME:${{ github.ref_name }}
--tag $DOCKER_USER/$REPO_NAME:test
- name: Login to Docker Hub
uses: docker/login-action@v3
Expand Down
4 changes: 4 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
[submodule "ext/arabic_rom"]
path = ext/arabic_rom
url = https://github.com/fadhleryani/Arabic_ALA-LC_Romanization.git
[submodule "ext/yiddish"]
path = ext/yiddish
url = https://github.com/scossu/yiddish.git
branch = loc
28 changes: 7 additions & 21 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,29 +1,15 @@
FROM python:3.10-slim-bullseye

RUN apt update
RUN apt install -y build-essential tzdata gfortran libopenblas-dev libboost-all-dev

ENV TZ=America/New_York
ENV _workroot "/usr/local/scriptshifter/src"

WORKDIR ${_workroot}
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Remove development packages.
RUN apt remove -y build-essential
RUN apt autoremove -y

RUN addgroup --system www
RUN adduser --system www
RUN gpasswd -a www www
FROM lcnetdev/scriptshifter-base:latest
# ARG takes its default in NAME=value form. Written as two space-separated
# words, WORKROOT would be declared without a default.
ARG WORKROOT="/usr/local/scriptshifter/src"

# Copy core application files.
WORKDIR ${WORKROOT}
COPY entrypoint.sh uwsgi.ini wsgi.py ./
COPY ext ./ext/
COPY scriptshifter ./scriptshifter/
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

RUN chmod +x ./entrypoint.sh
RUN chown -R www:www ${_workroot} .
#RUN chown -R www:www ${WORKROOT} .

EXPOSE 8000

Expand Down
7 changes: 7 additions & 0 deletions deps.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# External dependencies.
aksharamukha>=2.2,<3
camel-tools>=1.5
funcy>=1.15,<2
pymarc>=4.0,<5
repackage>=0.7.3
./ext/yiddish
2 changes: 1 addition & 1 deletion doc/hooks.md
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ and return it before any further default processing is done.

#### Output

`"ret"` or `None`. If `"ret"`, the transliteration function returns `ctx.dest`
String or `None`. If a string, the transliteration function returns that
immediately; otherwise it proceeds with standard adjustments of the output
string before returning.

Expand Down
23 changes: 23 additions & 0 deletions doc/rest_api.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ Transliterate an input string into a given language.

### POST body

MIME type: `application/json`

Content: JSON object with the following keys:

- `lang`: Language code as given by the `/languages` endpoint.
- `text`: Input text to be transliterated.
- `capitalize`: One of `first` (capitalize the first letter of the input),
Expand All @@ -92,3 +96,22 @@ Content: JSON object containing two keys: `output` containing the transliterated
string; and `warnings` containing a list of warnings. Characters not found in
the mapping are copied verbatim in the transliterated string (see
"Configuration files" section for more information).

## `POST /feedback`

Send a feedback form about a transliteration result.

### POST body

MIME type: `application/json`

Content: JSON object with the following keys:

- `lang`: language of the transliteration. Mandatory.
- `src`: source text. Mandatory.
- `t_dir`: transliteration direction. If omitted, it defaults to `s2r`.
- `result`: result of the transliteration. Mandatory.
- `expected`: expected result. Mandatory.
- `options`: options passed to the request, if any.
- `notes`: optional user notes.
- `contact`: contact email for feedback. Optional.
1 change: 1 addition & 0 deletions ext/yiddish
Submodule yiddish added at 9bf22c
6 changes: 1 addition & 5 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
aksharamukha>=2.1,<3
camel-tools>=1.5
# Core application dependencies.
flask>=2.3,<3
funcy>=1.15,<2
pymarc>=4.0,<5
python-dotenv>=1.0,<2
pyyaml>=6.0,<7
repackage>=0.7.3
uwsgi>=2.0,<2.1
18 changes: 12 additions & 6 deletions scriptshifter/hooks/aksharamukha/romanizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,22 @@
logger = getLogger(__name__)


def s2r_post_config(ctx, src_script, pre=None, post=None):
    """
    Convert `ctx.src` from `src_script` to "RomanLoC" and store it in
    `ctx.dest`.

    Args:
        ctx: transliteration context. `ctx.src` and `ctx.options` are read;
            `ctx.dest` is written.
        src_script: source script name, passed to `process` as the source.
        pre: optional list of pre-options placed before the options enabled
            in `ctx.options`. Defaults to no extra pre-options.
        post: optional list of post-options passed on to `process`.
            Defaults to no post-options.

    Returns:
        `BREAK`.
    """
    # options = detect_preoptions(ctx.src, src_script)

    # Build fresh lists on every call. Mutable default arguments are shared
    # between calls, and these lists are handed to external code.
    pre_options = list(pre or []) + [
        n for n, v in ctx.options.items() if v and n != "capitalize"]
    post_options = list(post or [])

    ctx.dest = process(
        src_script, "RomanLoC", ctx.src,
        pre_options=pre_options, post_options=post_options)

    return BREAK


def r2s_post_config(ctx, dest_script, pre=None, post=None):
    """
    Convert `ctx.src` from "RomanLoC" to `dest_script` and store it in
    `ctx.dest`.

    Args:
        ctx: transliteration context. `ctx.src` and `ctx.options` are read;
            `ctx.dest` is written.
        dest_script: destination script name, passed to `process` as the
            target.
        pre: optional list of pre-options passed on to `process`.
            Defaults to no pre-options.
        post: optional list of post-options placed before the options
            enabled in `ctx.options`. Defaults to no extra post-options.

    Returns:
        `BREAK`.
    """
    # Build fresh lists on every call. Mutable default arguments are shared
    # between calls, and these lists are handed to external code.
    pre_options = list(pre or [])
    post_options = list(post or []) + [
        n for n, v in ctx.options.items() if v and n != "capitalize"]

    ctx.dest = process(
        "RomanLoC", dest_script, ctx.src,
        pre_options=pre_options, post_options=post_options)

    return BREAK
131 changes: 131 additions & 0 deletions scriptshifter/hooks/chinese/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
__doc__ = """Chinese hooks."""


from logging import getLogger
from re import I, compile, search, sub

from scriptshifter.hooks.general import normalize_spacing_post_assembly


logger = getLogger(__name__)


def parse_numerals_pre_assembly(ctx):
"""
Parse Chinese numerals in the already romanized token list.

This reads the romanized tokens from `ctx.dest_ls`, builds the output
string itself, stores it in `ctx.dest`, and returns the result of
`normalize_spacing_post_assembly(ctx)`.

NOTE(review): the previous docstring said this is run at post-assembly,
but the function name and its use of `ctx.dest_ls` indicate a pre-assembly
hook; confirm against the hook configuration.
"""
# Start with the numeric version enabled only for specific MARC fields.
use_num_v = ctx.options.get("marc_field") in ("245n", "830n")

# tokens = split(r"[\W^#]", ctx.dest) # Original logic.
tk_ct = len(ctx.dest_ls)
# Numerical token shape: letters, "#", optional digits, optional trailing
# whitespace.
token_ptn = compile(r"^([A-Za-z]+)#([0-9]*)(\s*)$")

output = ""

# Use manual loop as i is manipulated inside it.
i = 0

while i < tk_ct:
tk_i = ctx.dest_ls[i]
if search(token_ptn, tk_i):
# When a numerical token (containing #) is reached, the inner loop
# consumes it and all consecutive numerical tokens found after it.
# Two versions of the string are maintained. The text version
# (text_v) is the original pinyin (minus the # suffixes). In the
# numeric version (num_v), characters representing numbers are
# converted to Arabic numerals. When a non-numerical token (or end
# of string) is encountered, the string of numerical tokens is
# evaluated to determine which version should be used in the output
# string. The outer loop then continues where the inner loop left
# off.
logger.debug(f"Match number: {tk_i}.")
text_v = num_v = ""
for j in range(i, tk_ct):
tk_j = ctx.dest_ls[j]
m = search(token_ptn, tk_j)
# if m:
# logger.debug(f"m[1]: {m[1]} - m[2]: {m[2]}")
# a token without # (or the end of string) is reached
if not m or j == tk_ct - 1:
logger.debug(f"Next token is not numeric: {tk_j}")
# If the token matched, then we are on the last token and it
# is numeric. Add text after # (if present) to the numerical
# version, plus the captured whitespace after the number.
if m:
text_v += m[1] + m[3]
num_v += m[2] + m[3] if len(m[2]) else m[1] + m[3]
# Append white space.
num_v += " "
elif j == tk_ct - 1:
# if last token is non-numerical, just tack it on.
logger.debug(f"Last token is non-numerical: {tk_j}")
text_v += tk_j
num_v += tk_j
# Evaluate the numerical string that has been constructed so
# far. Use the numeric version for ordinals and date strings.
if (
search("^di [0-9]", num_v, flags=I) or
search("[0-9] [0-9] [0-9] [0-9]", num_v) or
search("[0-9]+ nian [0-9]+ yue", num_v, flags=I) or
search("[0-9]+ yue [0-9]+ ri", num_v, flags=I)
):
use_num_v = True
# At this point, the string may contain literal
# translations of Chinese numerals. Convert these to
# Arabic numerals (for example "2 10 7" = "27").
mult_ptn = compile(r"(\b[0-9]) ([1-9]0+)")
sum_ptn = compile("([1-9]0+) ([0-9]+)")
while _m := search("[0-9] 10+|[1-9]0+ [1-9]", num_v):
logger.debug(f"Match number combination: {_m}")
if m := mult_ptn.search(num_v):
logger.debug(f"Multiply: {m[1]}, {m[2]}")
parsed = int(m[1]) * int(m[2])
num_v = mult_ptn.sub(str(parsed), num_v, 1)
elif m := sum_ptn.search(num_v):
logger.debug(f"Add: {m[1]}, {m[2]}")
parsed = int(m[1]) + int(m[2])
num_v = sum_ptn.sub(str(parsed), num_v, 1)
else:
break
# A few other tweaks: join four space-separated single digits.
num_v = sub(
"([0-9]) ([0-9]) ([0-9]) ([0-9])",
r"\1\2\3\4", num_v)
# NOTE(review): this checks "245"/"830" while the initial check
# above uses "245n"/"830n"; confirm the difference is intended.
if ctx.options.get("marc_field") in ("245", "830"):
# TODO optimize without loop.
while search("[0-9] [0-9]", num_v):
num_v = sub("([0-9]) ([0-9])", r"\1\2", num_v)

output += num_v if use_num_v else text_v

# if the end of the string is not reached, backtrack to the
# delimiter after the last numerical token (i.e. two tokens
# ago).
#
# Else, we are at the end of the string, so we are done!
i = j - 1 if j < tk_ct - 1 else j
break

# this is run when we are not yet at the end of the string and
# have not yet reached a non-numerical token. This is identical
# to the code that is run above when the last token is numeric,
# except that whitespace after the token is stripped.
# (This repeats the search already performed on tk_j above.)
m = search(token_ptn, tk_j)
text_v += m[1] + " "
num_v += m[2] if len(m[2]) else m[1]
num_v += " "

else:
logger.debug(f"No numeric match: adding {tk_i}.")
output += tk_i

i += 1

logger.debug(f"Use num version: {use_num_v}")
ctx.dest = output

# Skip main transliterate function joining.

return normalize_spacing_post_assembly(ctx)
Loading

0 comments on commit 26c3513

Please sign in to comment.