From 4f3d021c63573eefcb54609fbe7bff13e6895857 Mon Sep 17 00:00:00 2001 From: scossu Date: Thu, 11 Apr 2024 09:08:28 -0400 Subject: [PATCH] WIP complete number parsing. --- scriptshifter/hooks/chinese/__init__.py | 137 +++++++++++------ scriptshifter/hooks/chinese/processNumbers.ts | 144 ++++++++++++++++++ 2 files changed, 231 insertions(+), 50 deletions(-) create mode 100644 scriptshifter/hooks/chinese/processNumbers.ts diff --git a/scriptshifter/hooks/chinese/__init__.py b/scriptshifter/hooks/chinese/__init__.py index 4305223..c6f5b8a 100644 --- a/scriptshifter/hooks/chinese/__init__.py +++ b/scriptshifter/hooks/chinese/__init__.py @@ -1,7 +1,33 @@ __doc__ = """Chinese hooks.""" -from re import I, compile, match, split, sub +from logging import getLogger +from os import path +from re import I, compile, search, sub + +from yaml import load +try: + from yaml import CLoader as Loader +except ImportError: + from yaml import Loader + + +HOOK_DIR = path.dirname(path.realpath(__file__)) + +logger = getLogger(__name__) + + +def merge_numerals_pre_config(tdata): + """ + Add numerals mapping to configuration. + + This overrides the existing character mappings. + """ + num_map_yml = path.join(HOOK_DIR, "numerals.yml") + with open(num_map_yml, "r") as fh: + num_map = load(fh, Loader=Loader) + + tdata["script_to_roman"]["map"].update(num_map) def parse_numerals(ctx): @@ -13,100 +39,111 @@ def parse_numerals(ctx): # Only apply to specific MARC fields. use_num_v = ctx.options.get("marc_field") in ("245", "830") - tokens = split(r"[\W^#]", ctx.dest) + # tokens = split(r"[\W^#]", ctx.dest) # Original logic. + tokens = [tk.strip() for tk in ctx.dest_ls] tk_ct = len(tokens) - - token_ptn = compile("^([A-Za-z]+)#([0-9]*)$") + token_ptn = compile(r"^([A-Za-z]+)#([0-9]*)$") output = "" # Use manual loop as i is manipulated inside it. i = 0 + while i < tk_ct: tk_i = tokens[i] - if match(token_ptn, tk_i): + if search(token_ptn, tk_i): + # When a numerical token (containing #) is reached, the inner loop + # consumes it and all consecutive numerical tokens found after it. + # Two versions of the string are maintained. The textVersion is + # the original pinyin (minus the # suffixes). In the numVersion, + # characters representing numbers are converted to Arabic + # numerals. When a non-numerical token (or end of string) is + # encountered, the string of numerical tokens is evaluated to + # determine which version should be used in the output string. + # The outer loop then continues where the inner loop left off. + logger.debug(f"Match number: {tk_i}") text_v = num_v = "" - for j, tk_j in enumerate(tokens): - m = match(token_ptn, tk_j) + for j in range(i, tk_ct): + tk_j = tokens[j] + m = search(token_ptn, tk_j) + # if m: + # logger.debug(f"m[1]: {m[1]} - m[2]: {m[2]}") # a token without # (or the end of string) is reached - if ((j % 2 == 0 and not m) or j == len(tokens) - 1): + if not m or j == tk_ct - 1: + logger.debug(f"Next token is not numeric: {tk_j}") # If this runs, then we are on the last token and it is # numeric. Add text after # (if present) to numerical # version if m: - text_v += m[1] - num_v += m[2] if m[2] else m[1] + text_v += m[1] + " " + num_v += m[2] if len(m[2]) else m[1] + # Append white space. + num_v += " " elif j == tk_ct - 1: # if last token is non-numerical, just tack it on. + logger.debug(f"Last token is non-numerical: {tk_j}") text_v += tk_j num_v += tk_j - elif len(text_v) and len(num_v): - # if not at end of string yet and token is - # non-numerical, remove the last delimiter that was - # appended (outer loop will pick up at this point) - text_v = text_v[:-1] - num_v = num_v[:-1] # evaluate numerical string that has been constructed so # far. Use num version for ordinals and date strings if ( - match("^di [0-9]", num_v, flags=I) or - match("[0-9] [0-9] [0-9] [0-9]", num_v) or - match("[0-9]+ nian [0-9]+ yue", num_v, flags=I) or - match("\"[0-9]+ yue [0-9]+ ri", num_v, flags=I) + search("^di [0-9]", num_v, flags=I) or + search("[0-9] [0-9] [0-9] [0-9]", num_v) or + search("[0-9]+ nian [0-9]+ yue", num_v, flags=I) or + search("\"[0-9]+ yue [0-9]+ ri", num_v, flags=I) ): use_num_v = True # At this point, string may contain literal # translations of Chinese numerals Convert these to # Arabic numerals (for example "2 10 7" = "27"). - while ( - match(num_v, "[0-9] 10+") or - match(num_v, "[1-9]0+ [1-9]")): - m = match(num_v, "([0-9]+) ([1-9]0+)") - if m: - parsed_sum = int(m[1]) + int(m[2]) - num_v = sub( - "[0-9]+ [1-9]0+", str(parsed_sum), - num_v, 1) + mult_ptn = compile(r"(\b[0-9]) ([1-9]0+)") + sum_ptn = compile("([1-9]0+) ([0-9]+)") + while _m := search("[0-9] 10+|[1-9]0+ [1-9]", num_v): + logger.debug(f"Match number combination: {_m}") + if m := mult_ptn.search(num_v): + logger.debug(f"Multiply: {m[1]}, {m[2]}") + parsed = int(m[1]) * int(m[2]) + num_v = mult_ptn.sub(str(parsed), num_v, 1) + elif m := sum_ptn.search(num_v): + logger.debug(f"Add: {m[1]}, {m[2]}") + parsed = int(m[1]) + int(m[2]) + num_v = sum_ptn.sub(str(parsed), num_v, 1) else: - mb = match(num_v, "([1-9]0+) ([0-9]+)") - if mb: - parsed_sum_b = int(m[1]) + int(m[2]) - num_v = sub( - "[1-9]0+ [0-9]+", - str(parsed_sum_b), num_v, 1) - else: - break + break # A few other tweaks num_v = sub( "([0-9]) ([0-9]) ([0-9]) ([0-9])", r"\1\2\3\4", num_v) if ctx.options.get("marc_field") in ("245", "830"): # TODO optimize without loop. - while match("[0-9] [0-9]", num_v): + while search("[0-9] [0-9]", num_v): num_v = sub("([0-9]) ([0-9])", r"\1\2", num_v) + logger.debug(f"num_v: {num_v}") + logger.debug(f"text_v: {text_v}") output += num_v if use_num_v else text_v # if the end of the string is not reached, backtrack to the # delimiter after the last numerical token (i.e. two tokens - # ago) - - i = j - 2 if j < tk_ct - 1 else j + # ago). + # + # Else, we are at the end of the string, so we are done! + i = j - 1 if j < tk_ct - 1 else j break # this is run when we are not yet at the end of the string and # have not yet reached a non-numerical token. This is identical # to the code that is run above when the last token is numeric. - - if j % 2 == 0: - m = match(token_ptn, tk_j) - text_v += m[1] - num_v += m[2] if m[2] else m[1] - else: - text_v += tk_j - num_v += tk_j + m = search(token_ptn, tk_j) + text_v += m[1] + " " + num_v += m[2] if len(m[2]) else m[1] + num_v += " " else: - output += tk_i + logger.debug(f"No match: adding {tk_i}.") + output += tk_i + " " + + i += 1 + print(f"Use num version: {use_num_v}") ctx.dest = output diff --git a/scriptshifter/hooks/chinese/processNumbers.ts b/scriptshifter/hooks/chinese/processNumbers.ts new file mode 100644 index 0000000..691e5ab --- /dev/null +++ b/scriptshifter/hooks/chinese/processNumbers.ts @@ -0,0 +1,144 @@ +private processNumbers(pinyinString: string, tag: string, code: string): string { + let outputString = ""; + let useNumVersion = false; + //useNumVersion is set in specific subfields where we definitely want to treat numbers as numbers + if ((tag == "245" || tag == "830") && code == "n") { + useNumVersion = true; + } + + /* + * The input string is split, with any space or punctuation character (except for #) as the delimiter. + * The delimiters will be captured and included in the string of tokens. Only the even-numbered + * array elements are the true 'tokens', so the code for processing tokens is run only for even + * values of j. + */ + let tokens: string[] = pinyinString.split(new RegExp("([^\\P{P}#]|\\s)","u")); + let numTokenPattern = "^([A-Za-z]+)#([0-9]*)$"; + let numToken_re = new RegExp(numTokenPattern); + let n = tokens.length + //this.alert.info(tokens.join("|"),{autoClose: false}) + for (let i = 0; i < n; i++) { + let toki = tokens[i]; + if (toki.match(numToken_re)) { + /* + * When a numerical token (containing #) is reached, the inner loop consumes it and all consecutive numerical tokens + * found after it. Two versions of the string are maintained. The textVersion is the original pinyin (minus the + * # suffixes). In the numVersion, characters representing numbers are converted to Arabic numerals. When a + * non-numerical token (or end of string) is encountered, the string of numerical tokens is evaluated to determine + * which version should be used in the output string. The outer loop then continues where the inner loop left off. + */ + let textVersion = ""; + let numVersion = ""; + for (let j = i; j < n; j++) { + let tokj = tokens[j]; + /* a token without # (or the end of string) is reached */ + if ((j % 2 == 0 && !tokj.match(numToken_re)) || j == n - 1) { + //If this runs, then we are on the last token and it is numeric. Add text after # (if present) to numerical version + let m = tokj.match(numToken_re); + if (m) { + textVersion += m[1] + if (m[2] == "") { + numVersion += m[1]; + } else { + numVersion += m[2]; + } + } else if (j == n - 1) { + //if last token is non-numerical, just tack it on. + textVersion += tokj; + numVersion += tokj; + } else if (textVersion.length > 0 && numVersion.length > 0) { + //if not at end of string yet and token is non-numerical, remove the last delimiter that was appended + //(outer loop will pick up at this point) + textVersion = textVersion.substring(0, textVersion.length - 1); + numVersion = numVersion.substring(0, numVersion.length - 1); + } + //evaluate numerical string that has been constructed so far + //use num version for ordinals and date strings + if (numVersion.match(/^di [0-9]/i) || + numVersion.match(/[0-9] [0-9] [0-9] [0-9]/) || + numVersion.match(/[0-9]+ nian [0-9]+ yue/i) || + numVersion.match(/"[0-9]+ yue [0-9]+ ri/i) || + useNumVersion + ) { + useNumVersion = true; + /* + * At this point, string may contain literal translations of Chinese numerals + * Convert these to Arabic numerals (for example "2 10 7" = "27"). + */ + + while (numVersion.match(/[0-9] 10+/) || numVersion.match(/[1-9]0+ [1-9]/)) { + m = numVersion.match(/([0-9]+) ([1-9]0+)/); + if (m) { + let sum = Number(m[1]) * Number(m[2]); + numVersion = numVersion.replace(/[0-9]+ [1-9]0+/, String(sum)); + } else { + let mb = numVersion.match(/([1-9]0+) ([0-9]+)/); + if (mb) + { + let sumb = Number(mb[1]) + Number(mb[2]); + numVersion = numVersion.replace(/[1-9]0+ [0-9]+/, String(sumb)); + } + else + { + break; + } + } + } + + //A few other tweaks + numVersion = numVersion.replace(/([0-9]) ([0-9]) ([0-9]) ([0-9])/g, "$1$2$3$4"); + if ((tag == "245" || tag == "830") && code == "n") { + while (numVersion.match(/[0-9] [0-9]/)) { + numVersion = numVersion.replace(/([0-9]) ([0-9])/, "$1$2"); + } + } + } + if (useNumVersion) + { + outputString += numVersion; + } + else + { + outputString += textVersion; + } + //if the end of the string is not reached, backtrack to the delimiter after the last numerical token + //(i.e. two tokens ago) + if (j < n - 1) + { + i = j - 2; + } + else //we are at the end of the string, so we are done! + { + i = j; + } + break; + } + //this is run when we are not yet at the end of the string and have not yet reached a non-numerical token + //This is identical to the code that is run above when the last token is numeric. + if (j % 2 == 0) + { + let m = tokj.match(numToken_re); + textVersion += m[1]; + if (m[2]== "") + { + numVersion += m[1]; + } + else + { + numVersion += m[2]; + } + } + else //a delimiter, just tack it on. + { + textVersion += tokj; + numVersion += tokj; + } + } + } + else // the outer loop has encountered a non-numeric token or delimiter, just tack it on. + { + outputString += toki; + } + } + return outputString; + }