-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
231 additions
and
50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
private processNumbers(pinyinString: string, tag: string, code: string): string { | ||
let outputString = ""; | ||
let useNumVersion = false; | ||
//useNumVersion is set in specific subfields where we definitely want to treat numbers as numbers | ||
if ((tag == "245" || tag == "830") && code == "n") { | ||
useNumVersion = true; | ||
} | ||
|
||
/* | ||
* The input string is split, with any space or punctuation character (except for #) as the delimiter. | ||
* The delimiters will be captured and included in the string of tokens. Only the even-numbered | ||
* array elements are the true 'tokens', so the code for processing tokens is run only for even | ||
* values of j. | ||
*/ | ||
let tokens: string[] = pinyinString.split(new RegExp("([^\\P{P}#]|\\s)","u")); | ||
let numTokenPattern = "^([A-Za-z]+)#([0-9]*)$"; | ||
let numToken_re = new RegExp(numTokenPattern); | ||
let n = tokens.length | ||
//this.alert.info(tokens.join("|"),{autoClose: false}) | ||
for (let i = 0; i < n; i++) { | ||
let toki = tokens[i]; | ||
if (toki.match(numToken_re)) { | ||
/* | ||
* When a numerical token (containing #) is reached, the inner loop consumes it and all consecutive numerical tokens | ||
* found after it. Two versions of the string are maintained. The textVersion is the original pinyin (minus the | ||
* # suffixes). In the numVersion, characters representing numbers are converted to Arabic numerals. When a | ||
* non-numerical token (or end of string) is encountered, the string of numerical tokens is evaluated to determine | ||
* which version should be used in the output string. The outer loop then continues where the inner loop left off. | ||
*/ | ||
let textVersion = ""; | ||
let numVersion = ""; | ||
for (let j = i; j < n; j++) { | ||
let tokj = tokens[j]; | ||
/* a token without # (or the end of string) is reached */ | ||
if ((j % 2 == 0 && !tokj.match(numToken_re)) || j == n - 1) { | ||
//If this runs, then we are on the last token and it is numeric. Add text after # (if present) to numerical version | ||
let m = tokj.match(numToken_re); | ||
if (m) { | ||
textVersion += m[1] | ||
if (m[2] == "") { | ||
numVersion += m[1]; | ||
} else { | ||
numVersion += m[2]; | ||
} | ||
} else if (j == n - 1) { | ||
//if last token is non-numerical, just tack it on. | ||
textVersion += tokj; | ||
numVersion += tokj; | ||
} else if (textVersion.length > 0 && numVersion.length > 0) { | ||
//if not at end of string yet and token is non-numerical, remove the last delimiter that was appended | ||
//(outer loop will pick up at this point) | ||
textVersion = textVersion.substring(0, textVersion.length - 1); | ||
numVersion = numVersion.substring(0, numVersion.length - 1); | ||
} | ||
//evaluate numerical string that has been constructed so far | ||
//use num version for ordinals and date strings | ||
if (numVersion.match(/^di [0-9]/i) || | ||
numVersion.match(/[0-9] [0-9] [0-9] [0-9]/) || | ||
numVersion.match(/[0-9]+ nian [0-9]+ yue/i) || | ||
numVersion.match(/"[0-9]+ yue [0-9]+ ri/i) || | ||
useNumVersion | ||
) { | ||
useNumVersion = true; | ||
/* | ||
* At this point, string may contain literal translations of Chinese numerals | ||
* Convert these to Arabic numerals (for example "2 10 7" = "27"). | ||
*/ | ||
|
||
while (numVersion.match(/[0-9] 10+/) || numVersion.match(/[1-9]0+ [1-9]/)) { | ||
m = numVersion.match(/([0-9]+) ([1-9]0+)/); | ||
if (m) { | ||
let sum = Number(m[1]) * Number(m[2]); | ||
numVersion = numVersion.replace(/[0-9]+ [1-9]0+/, String(sum)); | ||
} else { | ||
let mb = numVersion.match(/([1-9]0+) ([0-9]+)/); | ||
if (mb) | ||
{ | ||
let sumb = Number(mb[1]) + Number(mb[2]); | ||
numVersion = numVersion.replace(/[1-9]0+ [0-9]+/, String(sumb)); | ||
} | ||
else | ||
{ | ||
break; | ||
} | ||
} | ||
} | ||
|
||
//A few other tweaks | ||
numVersion = numVersion.replace(/([0-9]) ([0-9]) ([0-9]) ([0-9])/g, "$1$2$3$4"); | ||
if ((tag == "245" || tag == "830") && code == "n") { | ||
while (numVersion.match(/[0-9] [0-9]/)) { | ||
numVersion = numVersion.replace(/([0-9]) ([0-9])/, "$1$2"); | ||
} | ||
} | ||
} | ||
if (useNumVersion) | ||
{ | ||
outputString += numVersion; | ||
} | ||
else | ||
{ | ||
outputString += textVersion; | ||
} | ||
//if the end of the string is not reached, backtrack to the delimiter after the last numerical token | ||
//(i.e. two tokens ago) | ||
if (j < n - 1) | ||
{ | ||
i = j - 2; | ||
} | ||
else //we are at the end of the string, so we are done! | ||
{ | ||
i = j; | ||
} | ||
break; | ||
} | ||
//this is run when we are not yet at the end of the string and have not yet reached a non-numerical token | ||
//This is identical to the code that is run above when the last token is numeric. | ||
if (j % 2 == 0) | ||
{ | ||
let m = tokj.match(numToken_re); | ||
textVersion += m[1]; | ||
if (m[2]== "") | ||
{ | ||
numVersion += m[1]; | ||
} | ||
else | ||
{ | ||
numVersion += m[2]; | ||
} | ||
} | ||
else //a delimiter, just tack it on. | ||
{ | ||
textVersion += tokj; | ||
numVersion += tokj; | ||
} | ||
} | ||
} | ||
else // the outer loop has encountered a non-numeric token or delimiter, just tack it on. | ||
{ | ||
outputString += toki; | ||
} | ||
} | ||
return outputString; | ||
} |