Skip to content

Commit

Permalink
WIP complete number parsing.
Browse files Browse the repository at this point in the history
  • Loading branch information
scossu committed Apr 11, 2024
1 parent f4bf40f commit 4f3d021
Show file tree
Hide file tree
Showing 2 changed files with 231 additions and 50 deletions.
137 changes: 87 additions & 50 deletions scriptshifter/hooks/chinese/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,33 @@
__doc__ = """Chinese hooks."""


from re import I, compile, match, split, sub
from logging import getLogger
from os import path
from re import I, compile, search, sub

from yaml import load
try:
from yaml import CLoader as Loader
except ImportError:
from yaml import Loader


HOOK_DIR = path.dirname(path.realpath(__file__))

logger = getLogger(__name__)


def merge_numerals_pre_config(tdata):
    """
    Add numerals mapping to configuration.

    The mapping is read from ``numerals.yml`` in this hook's directory and
    merged into ``tdata["script_to_roman"]["map"]`` in place. Entries from
    the numerals file override existing entries with the same key.

    Args:
        tdata: Transliteration configuration. It must provide a mutable
            mapping at ``tdata["script_to_roman"]["map"]``.

    Returns:
        None. ``tdata`` is modified in place.
    """
    num_map_yml = path.join(HOOK_DIR, "numerals.yml")
    # Decode explicitly as UTF-8 instead of relying on the platform locale:
    # the mapping presumably contains non-ASCII (CJK) keys, which would be
    # mis-decoded or raise on systems whose default encoding is not UTF-8.
    with open(num_map_yml, "r", encoding="utf-8") as fh:
        # NOTE(review): `Loader` is PyYAML's full loader, which can build
        # arbitrary Python objects. That is acceptable only because this
        # file ships with the package; consider SafeLoader if the mapping
        # only uses plain scalars.
        num_map = load(fh, Loader=Loader)

    tdata["script_to_roman"]["map"].update(num_map)


def parse_numerals(ctx):
Expand All @@ -13,100 +39,111 @@ def parse_numerals(ctx):
# Only apply to specific MARC fields.
use_num_v = ctx.options.get("marc_field") in ("245", "830")

tokens = split(r"[\W^#]", ctx.dest)
# tokens = split(r"[\W^#]", ctx.dest) # Original logic.
tokens = [tk.strip() for tk in ctx.dest_ls]
tk_ct = len(tokens)

token_ptn = compile("^([A-Za-z]+)#([0-9]*)$")
token_ptn = compile(r"^([A-Za-z]+)#([0-9]*)$")

output = ""

# Use manual loop as i is manipulated inside it.
i = 0

while i < tk_ct:
tk_i = tokens[i]
if match(token_ptn, tk_i):
if search(token_ptn, tk_i):
# When a numerical token (containing #) is reached, the inner loop
# consumes it and all consecutive numerical tokens found after it.
# Two versions of the string are maintained. The textVersion is
# the original pinyin (minus the # suffixes). In the numVersion,
# characters representing numbers are converted to Arabic
# numerals. When a non-numerical token (or end of string) is
# encountered, the string of numerical tokens is evaluated to
# determine which version should be used in the output string.
# The outer loop then continues where the inner loop left off.
logger.debug(f"Match number: {tk_i}")
text_v = num_v = ""
for j, tk_j in enumerate(tokens):
m = match(token_ptn, tk_j)
for j in range(i, tk_ct):
tk_j = tokens[j]
m = search(token_ptn, tk_j)
# if m:
# logger.debug(f"m[1]: {m[1]} - m[2]: {m[2]}")
# a token without # (or the end of string) is reached
if ((j % 2 == 0 and not m) or j == len(tokens) - 1):
if not m or j == tk_ct - 1:
logger.debug(f"Next token is not numeric: {tk_j}")
# If this runs, then we are on the last token and it is
# numeric. Add text after # (if present) to numerical
# version
if m:
text_v += m[1]
num_v += m[2] if m[2] else m[1]
text_v += m[1] + " "
num_v += m[2] if len(m[2]) else m[1]
# Append white space.
num_v += " "
elif j == tk_ct - 1:
# if last token is non-numerical, just tack it on.
logger.debug(f"Last token is non-numerical: {tk_j}")
text_v += tk_j
num_v += tk_j
elif len(text_v) and len(num_v):
# if not at end of string yet and token is
# non-numerical, remove the last delimiter that was
# appended (outer loop will pick up at this point)
text_v = text_v[:-1]
num_v = num_v[:-1]
# evaluate numerical string that has been constructed so
# far. Use num version for ordinals and date strings
if (
match("^di [0-9]", num_v, flags=I) or
match("[0-9] [0-9] [0-9] [0-9]", num_v) or
match("[0-9]+ nian [0-9]+ yue", num_v, flags=I) or
match("\"[0-9]+ yue [0-9]+ ri", num_v, flags=I)
search("^di [0-9]", num_v, flags=I) or
search("[0-9] [0-9] [0-9] [0-9]", num_v) or
search("[0-9]+ nian [0-9]+ yue", num_v, flags=I) or
search("\"[0-9]+ yue [0-9]+ ri", num_v, flags=I)
):
use_num_v = True
# At this point, string may contain literal
# translations of Chinese numerals Convert these to
# Arabic numerals (for example "2 10 7" = "27").
while (
match(num_v, "[0-9] 10+") or
match(num_v, "[1-9]0+ [1-9]")):
m = match(num_v, "([0-9]+) ([1-9]0+)")
if m:
parsed_sum = int(m[1]) + int(m[2])
num_v = sub(
"[0-9]+ [1-9]0+", str(parsed_sum),
num_v, 1)
mult_ptn = compile(r"(\b[0-9]) ([1-9]0+)")
sum_ptn = compile("([1-9]0+) ([0-9]+)")
while _m := search("[0-9] 10+|[1-9]0+ [1-9]", num_v):
logger.debug(f"Match number combination: {_m}")
if m := mult_ptn.search(num_v):
logger.debug(f"Multiply: {m[1]}, {m[2]}")
parsed = int(m[1]) * int(m[2])
num_v = mult_ptn.sub(str(parsed), num_v, 1)
elif m := sum_ptn.search(num_v):
logger.debug(f"Add: {m[1]}, {m[2]}")
parsed = int(m[1]) + int(m[2])
num_v = sum_ptn.sub(str(parsed), num_v, 1)
else:
mb = match(num_v, "([1-9]0+) ([0-9]+)")
if mb:
parsed_sum_b = int(m[1]) + int(m[2])
num_v = sub(
"[1-9]0+ [0-9]+",
str(parsed_sum_b), num_v, 1)
else:
break
break
# A few other tweaks
num_v = sub(
"([0-9]) ([0-9]) ([0-9]) ([0-9])",
r"\1\2\3\4", num_v)
if ctx.options.get("marc_field") in ("245", "830"):
# TODO optimize without loop.
while match("[0-9] [0-9]", num_v):
while search("[0-9] [0-9]", num_v):
num_v = sub("([0-9]) ([0-9])", r"\1\2", num_v)

logger.debug(f"num_v: {num_v}")
logger.debug(f"text_v: {text_v}")
output += num_v if use_num_v else text_v

# if the end of the string is not reached, backtrack to the
# delimiter after the last numerical token (i.e. two tokens
# ago)

i = j - 2 if j < tk_ct - 1 else j
# ago).
#
# Else, we are at the end of the string, so we are done!
i = j - 1 if j < tk_ct - 1 else j
break

# this is run when we are not yet at the end of the string and
# have not yet reached a non-numerical token. This is identical
# to the code that is run above when the last token is numeric.

if j % 2 == 0:
m = match(token_ptn, tk_j)
text_v += m[1]
num_v += m[2] if m[2] else m[1]
else:
text_v += tk_j
num_v += tk_j
m = search(token_ptn, tk_j)
text_v += m[1] + " "
num_v += m[2] if len(m[2]) else m[1]
num_v += " "

else:
output += tk_i
logger.debug(f"No match: adding {tk_i}.")
output += tk_i + " "

i += 1

print(f"Use num version: {use_num_v}")
ctx.dest = output
144 changes: 144 additions & 0 deletions scriptshifter/hooks/chinese/processNumbers.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
/**
 * Resolve "#"-tagged numeric tokens in a pinyin string.
 *
 * A numeric token has the form `<letters>#<digits>` (the digits may be
 * empty). Each run of consecutive numeric tokens is rendered either as plain
 * pinyin (the letters, "#" suffix dropped) or as Arabic numerals (the digits,
 * falling back to the letters when no digits are present), depending on
 * whether the run looks like an ordinal or a date, or whether the field is
 * one where numbers are always treated as numbers.
 *
 * @param pinyinString Pinyin text in which numeric tokens carry a "#" suffix.
 * @param tag Field tag; "245" and "830" receive special handling.
 * @param code Subfield code; "n" (together with the tags above) forces the
 *             numeric version.
 * @returns The string with all "#" markers resolved.
 */
private processNumbers(pinyinString: string, tag: string, code: string): string {
    let outputString = "";
    // Subfields where we definitely want to treat numbers as numbers.
    const forceNumerals = (tag === "245" || tag === "830") && code === "n";
    // Note: once this flag is switched on (by the subfield or by any numeric
    // run below) it is never reset, so it also applies to later runs.
    let useNumVersion = forceNumerals;

    /*
     * The input string is split, with any space or punctuation character
     * (except for #) as the delimiter. The delimiters are captured and
     * included in the array of tokens. Only the even-numbered array elements
     * are the true 'tokens', so the code for processing tokens is run only
     * for even values of j.
     */
    const tokens: string[] = pinyinString.split(new RegExp("([^\\P{P}#]|\\s)", "u"));
    const numTokenRe = new RegExp("^([A-Za-z]+)#([0-9]*)$");
    const n = tokens.length;

    for (let i = 0; i < n; i++) {
        const toki = tokens[i];
        if (!numTokenRe.test(toki)) {
            // Non-numeric token or delimiter: just tack it on.
            outputString += toki;
            continue;
        }

        /*
         * When a numerical token (containing #) is reached, the inner loop
         * consumes it and all consecutive numerical tokens found after it.
         * Two versions of the string are maintained. The textVersion is the
         * original pinyin (minus the # suffixes). In the numVersion,
         * characters representing numbers are converted to Arabic numerals.
         * When a non-numerical token (or end of string) is encountered, the
         * string of numerical tokens is evaluated to determine which version
         * should be used in the output string. The outer loop then continues
         * where the inner loop left off.
         */
        let textVersion = "";
        let numVersion = "";
        for (let j = i; j < n; j++) {
            const tokj = tokens[j];
            const numMatch = tokj.match(numTokenRe);

            // A token without # (or the end of the string) is reached.
            if ((j % 2 === 0 && !numMatch) || j === n - 1) {
                if (numMatch) {
                    // Last token of the string and it is numeric: prefer the
                    // digits after #, fall back to the text when absent.
                    textVersion += numMatch[1];
                    numVersion += numMatch[2] === "" ? numMatch[1] : numMatch[2];
                } else if (j === n - 1) {
                    // Last token is non-numerical: just tack it on.
                    textVersion += tokj;
                    numVersion += tokj;
                } else if (textVersion.length > 0 && numVersion.length > 0) {
                    // Not at the end of the string and the token is
                    // non-numerical: drop the last (single-character)
                    // delimiter that was appended; the outer loop re-emits it.
                    textVersion = textVersion.substring(0, textVersion.length - 1);
                    numVersion = numVersion.substring(0, numVersion.length - 1);
                }

                // Evaluate the numerical string built so far. Use the
                // numeric version for ordinals and date strings.
                // (The month/day pattern previously started with a stray
                // double quote and therefore never matched a date.)
                if (useNumVersion ||
                    /^di [0-9]/i.test(numVersion) ||
                    /[0-9] [0-9] [0-9] [0-9]/.test(numVersion) ||
                    /[0-9]+ nian [0-9]+ yue/i.test(numVersion) ||
                    /[0-9]+ yue [0-9]+ ri/i.test(numVersion)
                ) {
                    useNumVersion = true;

                    /*
                     * At this point, the string may contain literal
                     * translations of Chinese numerals. Convert these to
                     * Arabic numerals (for example "2 10 7" = "27"):
                     * "<digits> <power of ten multiple>" is multiplied
                     * first, then "<multiple> <digits>" is added.
                     */
                    while (/[0-9] 10+/.test(numVersion) || /[1-9]0+ [1-9]/.test(numVersion)) {
                        const multiplied = numVersion.match(/([0-9]+) ([1-9]0+)/);
                        if (multiplied) {
                            const product = Number(multiplied[1]) * Number(multiplied[2]);
                            numVersion = numVersion.replace(/[0-9]+ [1-9]0+/, String(product));
                        } else {
                            const added = numVersion.match(/([1-9]0+) ([0-9]+)/);
                            if (!added) {
                                break;
                            }
                            const total = Number(added[1]) + Number(added[2]);
                            numVersion = numVersion.replace(/[1-9]0+ [0-9]+/, String(total));
                        }
                    }

                    // A few other tweaks: join four space-separated digits
                    // (e.g. a year spelled digit by digit) ...
                    numVersion = numVersion.replace(/([0-9]) ([0-9]) ([0-9]) ([0-9])/g, "$1$2$3$4");
                    // ... and, in the forced-numeral subfields, join every
                    // remaining pair of space-separated digits.
                    if (forceNumerals) {
                        while (/[0-9] [0-9]/.test(numVersion)) {
                            numVersion = numVersion.replace(/([0-9]) ([0-9])/, "$1$2");
                        }
                    }
                }

                outputString += useNumVersion ? numVersion : textVersion;

                // If the end of the string is not reached, backtrack to the
                // delimiter after the last numerical token (i.e. two tokens
                // ago; the outer loop's i++ then lands on that delimiter).
                // Otherwise we are at the end of the string, so we are done.
                i = j < n - 1 ? j - 2 : j;
                break;
            }

            // Not yet at the end of the string and no non-numerical token
            // reached yet.
            if (j % 2 === 0 && numMatch) {
                // Numeric token: same handling as the last-token case above.
                textVersion += numMatch[1];
                numVersion += numMatch[2] === "" ? numMatch[1] : numMatch[2];
            } else {
                // A delimiter: just tack it on.
                textVersion += tokj;
                numVersion += tokj;
            }
        }
    }
    return outputString;
}

0 comments on commit 4f3d021

Please sign in to comment.