WIP complete number parsing.

lcnetdev · Apr 11, 2024 · 4f3d021 · 4f3d021
1 parent f4bf40f
commit 4f3d021
Show file tree

Hide file tree

Showing 2 changed files with 231 additions and 50 deletions.
diff --git a/scriptshifter/hooks/chinese/__init__.py b/scriptshifter/hooks/chinese/__init__.py
@@ -1,7 +1,33 @@
 __doc__ = """Chinese hooks."""
 
 
-from re import I, compile, match, split, sub
+from logging import getLogger
+from os import path
+from re import I, compile, search, sub
+
+from yaml import load
+try:
+    from yaml import CLoader as Loader
+except ImportError:
+    from yaml import Loader
+
+
+HOOK_DIR = path.dirname(path.realpath(__file__))
+
+logger = getLogger(__name__)
+
+
+def merge_numerals_pre_config(tdata):
+    """
+    Add numerals mapping to configuration.
+
+    Numeral entries override existing character mappings with the same key.
+    """
+    num_map_yml = path.join(HOOK_DIR, "numerals.yml")
+    with open(num_map_yml, "r") as fh:
+        num_map = load(fh, Loader=Loader)
+
+    tdata["script_to_roman"]["map"].update(num_map)
 
 
 def parse_numerals(ctx):
@@ -13,100 +39,111 @@ def parse_numerals(ctx):
     # Only apply to specific MARC fields.
     use_num_v = ctx.options.get("marc_field") in ("245", "830")
 
-    tokens = split(r"[\W^#]", ctx.dest)
+    # tokens = split(r"[\W^#]", ctx.dest)  # Original logic.
+    tokens = [tk.strip() for tk in ctx.dest_ls]
     tk_ct = len(tokens)
-
-    token_ptn = compile("^([A-Za-z]+)#([0-9]*)$")
+    token_ptn = compile(r"^([A-Za-z]+)#([0-9]*)$")
 
     output = ""
 
     # Use manual loop as i is manipulated inside it.
     i = 0
+
     while i < tk_ct:
         tk_i = tokens[i]
-        if match(token_ptn, tk_i):
+        if search(token_ptn, tk_i):
+            # When a numerical token (containing #) is reached, the inner loop
+            # consumes it and all consecutive numerical tokens found after it.
+            # Two versions of the string are maintained: text_v is the
+            # original pinyin (minus the # suffixes); in num_v, characters
+            # representing numbers are converted to Arabic numerals. When a
+            # non-numerical token (or end of string) is encountered, the
+            # string of numerical tokens is evaluated to determine which
+            # version should be used in the output string. The outer loop
+            # then continues where the inner loop left off.
+            logger.debug(f"Match number: {tk_i}")
             text_v = num_v = ""
-            for j, tk_j in enumerate(tokens):
-                m = match(token_ptn, tk_j)
+            for j in range(i, tk_ct):
+                tk_j = tokens[j]
+                m = search(token_ptn, tk_j)
+                # if m:
+                #     logger.debug(f"m[1]: {m[1]} - m[2]: {m[2]}")
                 # a token without # (or the end of string) is reached
-                if ((j % 2 == 0 and not m) or j == len(tokens) - 1):
+                if not m or j == tk_ct - 1:
+                    logger.debug(f"Next token is not numeric: {tk_j}")
                     # If this runs, then we are on the last token and it is
                     # numeric. Add text after # (if present) to numerical
                     # version
                     if m:
-                        text_v += m[1]
-                        num_v += m[2] if m[2] else m[1]
+                        text_v += m[1] + " "
+                        num_v += m[2] if len(m[2]) else m[1]
+                        # Append white space.
+                        num_v += " "
                     elif j == tk_ct - 1:
                         # if last token is non-numerical, just tack it on.
+                        logger.debug(f"Last token is non-numerical: {tk_j}")
                         text_v += tk_j
                         num_v += tk_j
-                    elif len(text_v) and len(num_v):
-                        # if not at end of string yet and token is
-                        # non-numerical, remove the last delimiter that was
-                        # appended (outer loop will pick up at this point)
-                        text_v = text_v[:-1]
-                        num_v = num_v[:-1]
                     # evaluate numerical string that has been constructed so
                     # far. Use num version for ordinals and date strings
                     if (
-                        match("^di [0-9]", num_v, flags=I) or
-                        match("[0-9] [0-9] [0-9] [0-9]", num_v) or
-                        match("[0-9]+ nian [0-9]+ yue", num_v, flags=I) or
-                        match("\"[0-9]+ yue [0-9]+ ri", num_v, flags=I)
+                        search("^di [0-9]", num_v, flags=I) or
+                        search("[0-9] [0-9] [0-9] [0-9]", num_v) or
+                        search("[0-9]+ nian [0-9]+ yue", num_v, flags=I) or
+                        search("\"[0-9]+ yue [0-9]+ ri", num_v, flags=I)
                     ):
                         use_num_v = True
                         # At this point, string may contain literal
                         # translations of Chinese numerals Convert these to
                         # Arabic numerals (for example "2 10 7" = "27").
-                        while (
-                                match(num_v, "[0-9] 10+") or
-                                match(num_v, "[1-9]0+ [1-9]")):
-                            m = match(num_v, "([0-9]+) ([1-9]0+)")
-                            if m:
-                                parsed_sum = int(m[1]) + int(m[2])
-                                num_v = sub(
-                                        "[0-9]+ [1-9]0+", str(parsed_sum),
-                                        num_v, 1)
+                        mult_ptn = compile(r"(\b[0-9]) ([1-9]0+)")
+                        sum_ptn = compile("([1-9]0+) ([0-9]+)")
+                        while _m := search("[0-9] 10+|[1-9]0+ [1-9]", num_v):
+                            logger.debug(f"Match number combination: {_m}")
+                            if m := mult_ptn.search(num_v):
+                                logger.debug(f"Multiply: {m[1]}, {m[2]}")
+                                parsed = int(m[1]) * int(m[2])
+                                num_v = mult_ptn.sub(str(parsed), num_v, 1)
+                            elif m := sum_ptn.search(num_v):
+                                logger.debug(f"Add: {m[1]}, {m[2]}")
+                                parsed = int(m[1]) + int(m[2])
+                                num_v = sum_ptn.sub(str(parsed), num_v, 1)
                             else:
-                                mb = match(num_v, "([1-9]0+) ([0-9]+)")
-                                if mb:
-                                    parsed_sum_b = int(m[1]) + int(m[2])
-                                    num_v = sub(
-                                            "[1-9]0+ [0-9]+",
-                                            str(parsed_sum_b), num_v, 1)
-                                else:
-                                    break
+                                break
                         # A few other tweaks
                         num_v = sub(
                                 "([0-9]) ([0-9]) ([0-9]) ([0-9])",
                                 r"\1\2\3\4", num_v)
                         if ctx.options.get("marc_field") in ("245", "830"):
                             # TODO optimize without loop.
-                            while match("[0-9] [0-9]", num_v):
+                            while search("[0-9] [0-9]", num_v):
                                 num_v = sub("([0-9]) ([0-9])", r"\1\2", num_v)
 
+                    logger.debug(f"num_v: {num_v}")
+                    logger.debug(f"text_v: {text_v}")
                     output += num_v if use_num_v else text_v
 
                     # if the end of the string is not reached, backtrack to the
                     # delimiter after the last numerical token (i.e. two tokens
-                    # ago)
-
-                    i = j - 2 if j < tk_ct - 1 else j
+                    # ago). NOTE: the code now steps back only one token, so
+                    # the outer loop's `i += 1` resumes at token j.
+                    # Else, we are at the end of the string, so we are done!
+                    i = j - 1 if j < tk_ct - 1 else j
                     break
 
                 # this is run when we are not yet at the end of the string and
                 # have not yet reached a non-numerical token. This is identical
                 # to the code that is run above when the last token is numeric.
-
-                if j % 2 == 0:
-                    m = match(token_ptn, tk_j)
-                    text_v += m[1]
-                    num_v += m[2] if m[2] else m[1]
-                else:
-                    text_v += tk_j
-                    num_v += tk_j
+                m = search(token_ptn, tk_j)
+                text_v += m[1] + " "
+                num_v += m[2] if len(m[2]) else m[1]
+                num_v += " "
 
         else:
-            output += tk_i
+            logger.debug(f"No match: adding {tk_i}.")
+            output += tk_i + " "
+
+        i += 1
 
+    print(f"Use num version: {use_num_v}")
     ctx.dest = output
diff --git a/scriptshifter/hooks/chinese/processNumbers.ts b/scriptshifter/hooks/chinese/processNumbers.ts
@@ -0,0 +1,144 @@
+private processNumbers(pinyinString: string, tag: string, code: string): string {
+    let outputString = "";
+    let useNumVersion = false;
+    //useNumVersion is set in specific subfields where we definitely want to treat numbers as numbers
+    if ((tag == "245" || tag == "830") && code == "n") {
+       useNumVersion = true;
+    }
+
+    /*
+     * The input string is split, with any space or punctuation character (except for #) as the delimiter.
+     * The delimiters will be captured and included in the string of tokens.  Only the even-numbered
+     * array elements are the true 'tokens', so the code for processing tokens is run only for even
+     * values of j.
+     */
+    let tokens: string[] = pinyinString.split(new RegExp("([^\\P{P}#]|\\s)","u"));
+    let numTokenPattern = "^([A-Za-z]+)#([0-9]*)$";
+    let numToken_re = new RegExp(numTokenPattern);
+    let n = tokens.length
+    //this.alert.info(tokens.join("|"),{autoClose: false})
+    for (let i = 0; i < n; i++) {
+        let toki = tokens[i];
+        if (toki.match(numToken_re)) {
+            /*
+             * When a numerical token (containing #) is reached, the inner loop consumes it and all consecutive numerical tokens
+             * found after it.  Two versions of the string are maintained.  The textVersion is the original pinyin (minus the
+             * # suffixes).  In the numVersion, characters representing numbers are converted to Arabic numerals.  When a
+             * non-numerical token (or end of string) is encountered, the string of numerical tokens is evaluated to determine
+             * which version should be used in the output string.  The outer loop then continues where the inner loop left off.
+             */
+            let textVersion = "";
+            let numVersion = "";
+            for (let j = i; j < n; j++) {
+                let tokj = tokens[j];
+                /* a token without # (or the end of string) is reached */
+                if ((j % 2 == 0 && !tokj.match(numToken_re)) || j == n - 1) {
+                    //If this runs, then we are on the last token and it is numeric. Add text after # (if present) to numerical version
+                    let m = tokj.match(numToken_re);
+                    if (m) {
+                        textVersion += m[1]
+                        if (m[2] == "") {
+                            numVersion += m[1];
+                        } else {
+                            numVersion += m[2];
+                        }
+                    } else if (j == n - 1) {
+                    //if last token is non-numerical, just tack it on.
+                        textVersion += tokj;
+                        numVersion += tokj;
+                    } else if (textVersion.length > 0 && numVersion.length > 0) {
+                    //if not at end of string yet and token is non-numerical, remove the last delimiter that was appended
+                    //(outer loop will pick up at this point)
+                        textVersion = textVersion.substring(0, textVersion.length - 1);
+                        numVersion = numVersion.substring(0, numVersion.length - 1);
+                    }
+                    //evaluate numerical string that has been constructed so far
+                    //use num version for ordinals and date strings
+                    if (numVersion.match(/^di [0-9]/i) ||
+                        numVersion.match(/[0-9] [0-9] [0-9] [0-9]/) ||
+                        numVersion.match(/[0-9]+ nian [0-9]+ yue/i) ||
+                        numVersion.match(/"[0-9]+ yue [0-9]+ ri/i) ||
+                        useNumVersion
+                       ) {
+                        useNumVersion = true;
+                        /*
+                         * At this point, string may contain literal translations of Chinese numerals.
+                         * Convert these to Arabic numerals (for example "2 10 7" = "27").
+                         */
+
+                        while (numVersion.match(/[0-9] 10+/) || numVersion.match(/[1-9]0+ [1-9]/)) {
+                            m = numVersion.match(/([0-9]+) ([1-9]0+)/);
+                            if (m) {
+                                let sum = Number(m[1]) * Number(m[2]);
+                                numVersion = numVersion.replace(/[0-9]+ [1-9]0+/, String(sum));
+                            } else {
+                                let mb = numVersion.match(/([1-9]0+) ([0-9]+)/);
+                                if (mb)
+                                {
+                                    let sumb = Number(mb[1]) + Number(mb[2]);
+                                    numVersion = numVersion.replace(/[1-9]0+ [0-9]+/, String(sumb));
+                                }
+                                else
+                                {
+                                    break;
+                                }
+                            }
+                        }
+
+                        //A few other tweaks
+                        numVersion = numVersion.replace(/([0-9]) ([0-9]) ([0-9]) ([0-9])/g, "$1$2$3$4");
+                        if ((tag == "245" || tag == "830") && code == "n") {
+                            while (numVersion.match(/[0-9] [0-9]/)) {
+                                numVersion = numVersion.replace(/([0-9]) ([0-9])/, "$1$2");
+                            }
+                        }
+                    }
+                    if (useNumVersion)
+                    {
+                        outputString += numVersion;
+                    }
+                    else
+                    {
+                        outputString += textVersion;
+                    }
+                    //if the end of the string is not reached, backtrack to the delimiter after the last numerical token
+                    //(i.e. two tokens ago)
+                    if (j < n - 1)
+                    {
+                        i = j - 2;
+                    }
+                    else //we are at the end of the string, so we are done!
+                    {
+                        i = j;
+                    }
+                    break;
+                }
+                //this is run when we are not yet at the end of the string and have not yet reached a non-numerical token
+                //This is identical to the code that is run above when the last token is numeric.
+                if (j % 2 == 0)
+                {
+                    let m = tokj.match(numToken_re);
+                    textVersion += m[1];
+                    if (m[2]== "")
+                    {
+                        numVersion += m[1];
+                    }
+                    else
+                    {
+                        numVersion += m[2];
+                    }
+                }
+                else //a delimiter, just tack it on.
+                {
+                    textVersion += tokj;
+                    numVersion += tokj;
+                }
+            }
+        }
+        else // the outer loop has encountered a non-numeric token or delimiter, just tack it on.
+        {
+            outputString += toki;
+        }
+    }
+    return outputString;
+ }