From b6f3f5e32d984862eb993f872794568d673ff4ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Wed, 3 Jan 2024 10:48:18 +0200 Subject: [PATCH 1/8] New branch, format with black --- src/wiktextract/extractor/en/page.py | 1713 +++++++++++++++----------- 1 file changed, 1024 insertions(+), 689 deletions(-) diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py index 00e924b46..9aa6facd4 100644 --- a/src/wiktextract/extractor/en/page.py +++ b/src/wiktextract/extractor/en/page.py @@ -53,174 +53,177 @@ # Matches head tag HEAD_TAG_RE: Pattern = re.compile( - r"^(head|Han char|arabic-noun|arabic-noun-form|" - r"hangul-symbol|syllable-hangul)$|" + - r"^(latin|" + - "|".join(lang_code for lang_code, *_ in get_all_names("en")) + - r")-(" + - "|".join([ - "abbr", - "adj", - "adjective", - "adjective form", - "adjective-form", - "adv", - "adverb", - "affix", - "animal command", - "art", - "article", - "aux", - "bound pronoun", - "bound-pronoun", - "Buyla", - "card num", - "card-num", - "cardinal", - "chunom", - "classifier", - "clitic", - "cls", - "cmene", - "cmavo", - "colloq-verb", - "colverbform", - "combining form", - "combining-form", - "comparative", - "con", - "concord", - "conj", - "conjunction", - "conjug", - "cont", - "contr", - "converb", - "daybox", - "decl", - "decl noun", - "def", - "dem", - "det", - "determ", - "Deva", - "ending", - "entry", - "form", - "fuhivla", - "gerund", - "gismu", - "hanja", - "hantu", - "hanzi", - "head", - "ideophone", - "idiom", - "inf", - "indef", - "infixed pronoun", - "infixed-pronoun", - "infl", - "inflection", - "initialism", - "int", - "interfix", - "interj", - "interjection", - "jyut", - "latin", - "letter", - "locative", - "lujvo", - "monthbox", - "mutverb", - "name", - "nisba", - "nom", - "noun", - "noun form", - "noun-form", - "noun plural", - "noun-plural", - "nounprefix", - "num", - "number", - "numeral", - "ord", - "ordinal", - "par", - "part", - "part form", - "part-form", - "participle", - "particle", - "past", - "past neg", - "past-neg", - "past participle", - "past-participle", - "perfect participle", - "perfect-participle", - "personal pronoun", - "personal-pronoun", - "pref", - "prefix", - "phrase", - "pinyin", - "plural noun", - "plural-noun", - "pos", - "poss-noun", - "post", - "postp", - "postposition", - "PP", - "pp", - "ppron", - "pred", - "predicative", - "prep", - "prep phrase", - "prep-phrase", - "preposition", - "present participle", - "present-participle", - "pron", - "prondem", - "pronindef", - "pronoun", - "prop", - "proper noun", - "proper-noun", - "proper noun form", - "proper-noun form", - "proper noun-form", - "proper-noun-form", - "prov", - "proverb", - "prpn", - "prpr", - "punctuation mark", - "punctuation-mark", - "regnoun", - "rel", - "rom", - "romanji", - "root", - "sign", - "suff", - "suffix", - "syllable", - "symbol", - "verb", - "verb form", - "verb-form", - "verbal noun", - "verbal-noun", - "verbnec", - "vform", - ]) + - r")(-|/|\+|$)") + r"^(head|Han char|arabic-noun|arabic-noun-form|" + r"hangul-symbol|syllable-hangul)$|" + + r"^(latin|" + + "|".join(lang_code for lang_code, *_ in get_all_names("en")) + + r")-(" + + "|".join( + [ + "abbr", + "adj", + "adjective", + "adjective form", + "adjective-form", + "adv", + "adverb", + "affix", + "animal command", + "art", + "article", + "aux", + "bound pronoun", + "bound-pronoun", + "Buyla", + "card num", + "card-num", + "cardinal", + "chunom", + "classifier", + "clitic", + "cls", + "cmene", + "cmavo", + 
"colloq-verb", + "colverbform", + "combining form", + "combining-form", + "comparative", + "con", + "concord", + "conj", + "conjunction", + "conjug", + "cont", + "contr", + "converb", + "daybox", + "decl", + "decl noun", + "def", + "dem", + "det", + "determ", + "Deva", + "ending", + "entry", + "form", + "fuhivla", + "gerund", + "gismu", + "hanja", + "hantu", + "hanzi", + "head", + "ideophone", + "idiom", + "inf", + "indef", + "infixed pronoun", + "infixed-pronoun", + "infl", + "inflection", + "initialism", + "int", + "interfix", + "interj", + "interjection", + "jyut", + "latin", + "letter", + "locative", + "lujvo", + "monthbox", + "mutverb", + "name", + "nisba", + "nom", + "noun", + "noun form", + "noun-form", + "noun plural", + "noun-plural", + "nounprefix", + "num", + "number", + "numeral", + "ord", + "ordinal", + "par", + "part", + "part form", + "part-form", + "participle", + "particle", + "past", + "past neg", + "past-neg", + "past participle", + "past-participle", + "perfect participle", + "perfect-participle", + "personal pronoun", + "personal-pronoun", + "pref", + "prefix", + "phrase", + "pinyin", + "plural noun", + "plural-noun", + "pos", + "poss-noun", + "post", + "postp", + "postposition", + "PP", + "pp", + "ppron", + "pred", + "predicative", + "prep", + "prep phrase", + "prep-phrase", + "preposition", + "present participle", + "present-participle", + "pron", + "prondem", + "pronindef", + "pronoun", + "prop", + "proper noun", + "proper-noun", + "proper noun form", + "proper-noun form", + "proper noun-form", + "proper-noun-form", + "prov", + "proverb", + "prpn", + "prpr", + "punctuation mark", + "punctuation-mark", + "regnoun", + "rel", + "rom", + "romanji", + "root", + "sign", + "suff", + "suffix", + "syllable", + "symbol", + "verb", + "verb form", + "verb-form", + "verbal noun", + "verbal-noun", + "verbnec", + "vform", + ] + ) + + r")(-|/|\+|$)" +) FLOATING_TABLE_TEMPLATES: set[str] = { # az-suffix-form creates a style=floatright div that is otherwise @@ -439,8 +442,11 @@ "wtorw", } for x in PANEL_PREFIXES & wikipedia_templates: - print("WARNING: {!r} in both panel_templates and wikipedia_templates" - .format(x)) + print( + "WARNING: {!r} in both panel_templates and wikipedia_templates".format( + x + ) + ) # Mapping from a template name (without language prefix) for the main word # (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which @@ -482,8 +488,10 @@ for k, v in template_allowed_pos_map.items(): for x in v: if x not in PARTS_OF_SPEECH: - print("BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}" - "".format(x, k, v)) + print( + "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}" + "".format(x, k, v) + ) assert False @@ -526,9 +534,10 @@ # Regexp for matching ignored etymology template names. This adds certain # prefixes to the names listed above. ignored_etymology_templates_re = re.compile( - r"^((cite-|R:|RQ:).*|" + - r"|".join(re.escape(x) for x in ignored_etymology_templates) + - r")$") + r"^((cite-|R:|RQ:).*|" + + r"|".join(re.escape(x) for x in ignored_etymology_templates) + + r")$" +) # Regexp for matching ignored descendants template names. 
Right now we just # copy the ignored etymology templates @@ -655,11 +664,11 @@ def decode_html_entities(v: Union[str, int]) -> str: return html.unescape(v) -def parse_sense_linkage(wxr: - WiktextractContext, - data: WordData, - name: str, - ht: TemplateArgs, +def parse_sense_linkage( + wxr: WiktextractContext, + data: WordData, + name: str, + ht: TemplateArgs, ) -> None: """Parses a linkage (synonym, etc) specified in a word sense.""" assert isinstance(wxr, WiktextractContext) @@ -703,7 +712,7 @@ def parse_sense_linkage(wxr: alt = None m = re.search(r"\(([^)]+)\)$", w) if m: - w = w[:m.start()].strip() + w = w[: m.start()].strip() alt = m.group(1) dt = {"word": w} @@ -718,15 +727,15 @@ def parse_sense_linkage(wxr: data_append(data, field, dt) -def parse_language(wxr: WiktextractContext, - langnode: WikiNode, - language: str, - lang_code: str) -> list[WordData]: +def parse_language( + wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str +) -> list[WordData]: """Iterates over the text of the page, returning words (parts-of-speech) defined on the page one at a time. (Individual word senses for the same part-of-speech are typically encoded in the same entry.)""" # imported here to avoid circular import from wiktextract.pronunciations import parse_pronunciation + assert isinstance(wxr, WiktextractContext) assert isinstance(langnode, WikiNode) assert isinstance(language, str) @@ -737,15 +746,17 @@ def parse_language(wxr: WiktextractContext, word = wxr.wtp.title unsupported_prefix = "Unsupported titles/" if word.startswith(unsupported_prefix): - w = word[len(unsupported_prefix):] + w = word[len(unsupported_prefix) :] if w in unsupported_title_map: word = unsupported_title_map[w] else: - wxr.wtp.error("Unimplemented unsupported title: {}".format(word), - sortid="page/870") + wxr.wtp.error( + "Unimplemented unsupported title: {}".format(word), + sortid="page/870", + ) word = w elif word.startswith("Reconstruction:"): - word = word[word.find("/") + 1:] + word = word[word.find("/") + 1 :] is_reconstruction = True base_data = {"word": word, "lang": language, "lang_code": lang_code} @@ -771,14 +782,16 @@ def merge_base(data, base): continue if data[k] == v: continue - if (isinstance(data[k], (list, tuple)) or - isinstance(v, (list, tuple))): + if isinstance(data[k], (list, tuple)) or isinstance( + v, (list, tuple) + ): data[k] = list(data[k]) + list(v) elif data[k] != v: - wxr.wtp.warning("conflicting values for {} in merge_base: " - "{!r} vs {!r}" - .format(k, data[k], v), - sortid="page/904") + wxr.wtp.warning( + "conflicting values for {} in merge_base: " + "{!r} vs {!r}".format(k, data[k], v), + sortid="page/904", + ) def complementary_pop(pron, key): """Remove unnecessary keys from dict values @@ -792,30 +805,41 @@ def complementary_pop(pron, key): if "sounds" in data and "word" in data: accepted = [data["word"]] accepted.extend(f["form"] for f in data.get("forms", ())) - data["sounds"] = list(complementary_pop(s, "pos") - for s in data["sounds"] - if "form" not in s or s["form"] in accepted) + data["sounds"] = list( + complementary_pop(s, "pos") + for s in data["sounds"] + if "form" not in s or s["form"] in accepted + ) # If the result has sounds, eliminate sounds that have a pos that # does not match "pos" if "sounds" in data and "pos" in data: - data["sounds"] = list(s for s in data["sounds"] - if "pos" not in s or s["pos"] == data["pos"]) + data["sounds"] = list( + s + for s in data["sounds"] + if "pos" not in s or s["pos"] == data["pos"] + ) def push_sense(): """Starts 
collecting data for a new word sense. This returns True if a sense was added.""" nonlocal sense_data tags = sense_data.get("tags", ()) - if (not sense_data.get("glosses") and - "translation-hub" not in tags and - "no-gloss" not in tags): + if ( + not sense_data.get("glosses") + and "translation-hub" not in tags + and "no-gloss" not in tags + ): return False - if (("participle" in sense_data.get("tags", ()) or - "infinitive" in sense_data.get("tags", ())) and - "alt_of" not in sense_data and - "form_of" not in sense_data and - "etymology_text" in etym_data): + if ( + ( + "participle" in sense_data.get("tags", ()) + or "infinitive" in sense_data.get("tags", ()) + ) + and "alt_of" not in sense_data + and "form_of" not in sense_data + and "etymology_text" in etym_data + ): etym = etym_data["etymology_text"] etym = etym.split(". ")[0] ret = parse_alt_or_inflection_of(wxr, etym, set()) @@ -829,8 +853,9 @@ def push_sense(): data_extend(sense_data, "alt_of", lst) data_extend(sense_data, "tags", tags) - if (not sense_data.get("glosses") and - "no-gloss" not in sense_data.get("tags", ())): + if not sense_data.get("glosses") and "no-gloss" not in sense_data.get( + "tags", () + ): data_append(sense_data, "tags", "no-gloss") pos_datas.append(sense_data) @@ -965,13 +990,13 @@ def parse_part_of_speech(posnode, pos): floaters, poschildren = recursively_extract( posnode.children, lambda x: ( - isinstance(x, WikiNode) and - x.kind == NodeKind.TEMPLATE and - x.largs[0][0] in FLOATING_TABLE_TEMPLATES - ) + isinstance(x, WikiNode) + and x.kind == NodeKind.TEMPLATE + and x.largs[0][0] in FLOATING_TABLE_TEMPLATES + ), ) tempnode = WikiNode(NodeKind.LEVEL5, 0) - tempnode.largs = ['Inflection'] + tempnode.largs = ["Inflection"] tempnode.children = floaters parse_inflection(tempnode, "Floating Div", pos) # print(poschildren) @@ -981,12 +1006,12 @@ def parse_part_of_speech(posnode, pos): if not floaters: wxr.wtp.debug( "PoS section without contents", - sortid="en/page/1051/20230612" + sortid="en/page/1051/20230612", ) else: wxr.wtp.debug( "PoS section without contents except for a floating table", - sortid="en/page/1056/20230612" + sortid="en/page/1056/20230612", ) return @@ -1019,16 +1044,19 @@ def parse_part_of_speech(posnode, pos): elif collecting_head and kind == NodeKind.LINK: # We might collect relevant links as they are often pictures # relating to the word - if (len(node.largs[0]) >= 1 and - isinstance(node.largs[0][0], str)): - if node.largs[0][0].startswith(ns_title_prefix_tuple( - wxr, "Category")): + if len(node.largs[0]) >= 1 and isinstance( + node.largs[0][0], str + ): + if node.largs[0][0].startswith( + ns_title_prefix_tuple(wxr, "Category") + ): # [[Category:...]] # We're at the end of the file, probably, so stop # here. Otherwise the head will get garbage. 
break - if node.largs[0][0].startswith(ns_title_prefix_tuple( - wxr, "File")): + if node.largs[0][0].startswith( + ns_title_prefix_tuple(wxr, "File") + ): # Skips file links continue start_of_paragraph = False @@ -1040,8 +1068,12 @@ def parse_part_of_speech(posnode, pos): lists.append([]) # Lists parallels pre collecting_head = True start_of_paragraph = True - elif (collecting_head and - node.sarg not in ("gallery", "ref", "cite", "caption")): + elif collecting_head and node.sarg not in ( + "gallery", + "ref", + "cite", + "caption", + ): start_of_paragraph = False pre[-1].append(node) else: @@ -1065,17 +1097,17 @@ def parse_part_of_speech(posnode, pos): continue # skip these templates # if node.largs[0][0] in skip_these_templates_in_head: - # first_head_tmplt = False # no first_head_tmplt at all - # start_of_paragraph = False - # continue + # first_head_tmplt = False # no first_head_tmplt at all + # start_of_paragraph = False + # continue if first_head_tmplt and pre[-1]: first_head_tmplt = False start_of_paragraph = False pre[-1].append(node) elif pre[-1] and start_of_paragraph: - pre.append([]) # Switch to the next head - lists.append([]) # lists parallel pre + pre.append([]) # Switch to the next head + lists.append([]) # lists parallel pre collecting_head = True start_of_paragraph = False pre[-1].append(node) @@ -1102,8 +1134,9 @@ def parse_part_of_speech(posnode, pos): if not pre1 and not ls: # skip [] + [] continue - if not ls and all((isinstance(x, str) and not x.strip()) - for x in pre1): + if not ls and all( + (isinstance(x, str) and not x.strip()) for x in pre1 + ): # skip ["\n", " "] + [] continue if ls and not pre1: @@ -1128,60 +1161,75 @@ def parse_part_of_speech(posnode, pos): # # don't have gloss list # # XXX add code here to filter out 'garbage', like text # # that isn't a head template or head. - # continue + # continue if all(not sl for sl in lists[i:]): if i == 0: if isinstance(node, str): - wxr.wtp.debug("first head without list of senses," - "string: '{}[...]', {}/{}".format( - node[:20], word, language), - sortid="page/1689/20221215") + wxr.wtp.debug( + "first head without list of senses," + "string: '{}[...]', {}/{}".format( + node[:20], word, language + ), + sortid="page/1689/20221215", + ) if isinstance(node, WikiNode): - if node.largs and node.largs[0][0] in ["Han char",]: + if node.largs and node.largs[0][0] in [ + "Han char", + ]: # just ignore these templates pass else: - wxr.wtp.debug("first head without " - "list of senses, " - "template node " - "{}, {}/{}".format( - node.largs, word, language), - sortid="page/1694/20221215") + wxr.wtp.debug( + "first head without " + "list of senses, " + "template node " + "{}, {}/{}".format( + node.largs, word, language + ), + sortid="page/1694/20221215", + ) else: - wxr.wtp.debug("first head without list of senses, " - "{}/{}".format( - word, language), - sortid="page/1700/20221215") + wxr.wtp.debug( + "first head without list of senses, " + "{}/{}".format(word, language), + sortid="page/1700/20221215", + ) # no break here so that the first head always # gets processed. 
else: if isinstance(node, str): - wxr.wtp.debug("later head without list of senses," - "string: '{}[...]', {}/{}".format( - node[:20], word, language), - sortid="page/1708/20221215") + wxr.wtp.debug( + "later head without list of senses," + "string: '{}[...]', {}/{}".format( + node[:20], word, language + ), + sortid="page/1708/20221215", + ) if isinstance(node, WikiNode): - wxr.wtp.debug("later head without list of senses," - "template node " - "{}, {}/{}".format( - node.sarg if node.sarg else node.largs, - word, language), - sortid="page/1713/20221215") + wxr.wtp.debug( + "later head without list of senses," + "template node " + "{}, {}/{}".format( + node.sarg if node.sarg else node.largs, + word, + language, + ), + sortid="page/1713/20221215", + ) else: - wxr.wtp.debug("later head without list of senses, " - "{}/{}".format( - word, language), - sortid="page/1719/20221215") + wxr.wtp.debug( + "later head without list of senses, " + "{}/{}".format(word, language), + sortid="page/1719/20221215", + ) break head_group = i + 1 if there_are_many_heads else None # print("parse_part_of_speech: {}: {}: pre={}" - # .format(wxr.wtp.section, wxr.wtp.subsection, pre1)) - process_gloss_header(pre1, - pos, - head_group, - pos_data, - header_tags) + # .format(wxr.wtp.section, wxr.wtp.subsection, pre1)) + process_gloss_header( + pre1, pos, head_group, pos_data, header_tags + ) for l in ls: # Parse each list associated with this head. for node in l.children: @@ -1223,7 +1271,7 @@ def process_gloss_header( exp.children, lambda x: isinstance(x, WikiNode) and x.kind == NodeKind.HTML - and x.sarg == "ruby" + and x.sarg == "ruby", ) if rub is not None: for r in rub: @@ -1262,9 +1310,8 @@ def process_gloss_without_list( if isinstance(node, WikiNode): if node.kind == NodeKind.TEMPLATE: template_name = node.largs[0][0] - if ( - template_name == "head" - or template_name.startswith(f"{lang_code}-") + if template_name == "head" or template_name.startswith( + f"{lang_code}-" ): header_nodes.append(node) continue @@ -1292,14 +1339,17 @@ def parse_sense_node(node, sense_base, pos): """ assert isinstance(sense_base, dict) # Added to every sense deeper in if not isinstance(node, WikiNode): - wxr.wtp.debug("{}: parse_sense_node called with" - "something that isn't a WikiNode".format(pos), - sortid="page/1287/20230119") + wxr.wtp.debug( + "{}: parse_sense_node called with" + "something that isn't a WikiNode".format(pos), + sortid="page/1287/20230119", + ) return False if node.kind != NodeKind.LIST_ITEM: - wxr.wtp.debug("{}: non-list-item inside list".format(pos), - sortid="page/1678") + wxr.wtp.debug( + "{}: non-list-item inside list".format(pos), sortid="page/1678" + ) return False if node.sarg == ":": @@ -1330,26 +1380,34 @@ def parse_sense_node(node, sense_base, pos): # of subglosses below this. The list's # argument ends with #, and its depth should # be bigger than parent node. - subentries = [x for x in children - if isinstance(x, WikiNode) and - x.kind == NodeKind.LIST and - x.sarg == current_depth + "#"] + subentries = [ + x + for x in children + if isinstance(x, WikiNode) + and x.kind == NodeKind.LIST + and x.sarg == current_depth + "#" + ] # sublists of examples and quotations. .sarg # does not end with "#". - others = [x for x in children - if isinstance(x, WikiNode) and - x.kind == NodeKind.LIST and - x.sarg != current_depth + "#"] + others = [ + x + for x in children + if isinstance(x, WikiNode) + and x.kind == NodeKind.LIST + and x.sarg != current_depth + "#" + ] # the actual contents of this particular node. 
# can be a gloss (or a template that expands into # many glosses which we can't easily pre-expand) # or could be an "outer gloss" with more specific # subglosses, or could be a qualfier for the subglosses. - contents = [x for x in children - if not isinstance(x, WikiNode) or - x.kind != NodeKind.LIST] + contents = [ + x + for x in children + if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST + ] # If this entry has sublists of entries, we should combine # gloss information from both the "outer" and sublist content. # Sometimes the outer gloss @@ -1371,28 +1429,29 @@ def parse_sense_node(node, sense_base, pos): # copy current node and modify it so it doesn't # loop infinitely. cropped_node = copy.copy(node) - cropped_node.children = [x for x in children - if not (isinstance(x, WikiNode) and - x.kind == NodeKind.LIST and - x.sarg == current_depth + "#")] - added |= parse_sense_node(cropped_node, - sense_base, - pos) + cropped_node.children = [ + x + for x in children + if not ( + isinstance(x, WikiNode) + and x.kind == NodeKind.LIST + and x.sarg == current_depth + "#" + ) + ] + added |= parse_sense_node(cropped_node, sense_base, pos) nonlocal sense_data # this kludge causes duplicated raw_ - # glosses data if this is not done; - # if the top-level (cropped_node) - # does not push_sense() properly or - # parse_sense_node() returns early, - # sense_data is not reset. This happens - # for example when you have a no-gloss - # string like "(intransitive)": - # no gloss, push_sense() returns early - # and sense_data has duplicate data with - # sense_base + # glosses data if this is not done; + # if the top-level (cropped_node) + # does not push_sense() properly or + # parse_sense_node() returns early, + # sense_data is not reset. This happens + # for example when you have a no-gloss + # string like "(intransitive)": + # no gloss, push_sense() returns early + # and sense_data has duplicate data with + # sense_base sense_data = {} - added |= parse_sense_node(slc[0], - sense_base, - pos) + added |= parse_sense_node(slc[0], sense_base, pos) return added return process_gloss_contents( @@ -1430,8 +1489,7 @@ def sense_template_fn( arg = clean_node(wxr, sense_base, ht.get(2, ())) if re.match(r"Q\d+$", arg): data_append(sense_base, "wikidata", arg) - data_append(sense_base, "senseid", - langid + ":" + arg) + data_append(sense_base, "senseid", langid + ":" + arg) if name in sense_linkage_templates: # print(f"SENSE_TEMPLATE_FN: {name}") parse_sense_linkage(wxr, sense_base, name, ht) @@ -1470,7 +1528,7 @@ def sense_template_fn( if is_gloss: wxr.wtp.warning( "Example template is used for gloss text", - sortid="extractor.en.page.sense_template_fn/1415" + sortid="extractor.en.page.sense_template_fn/1415", ) else: return "" @@ -1504,8 +1562,11 @@ def extract_link_texts(item): return if item.kind == NodeKind.LINK: v = item.largs[-1] - if (isinstance(v, list) and len(v) == 1 and - isinstance(v[0], str)): + if ( + isinstance(v, list) + and len(v) == 1 + and isinstance(v[0], str) + ): gloss_template_args.add(v[0].strip()) for x in item.children: extract_link_texts(x) @@ -1542,7 +1603,7 @@ def extract_link_texts(item): strip_ends = [", particularly:"] for x in strip_ends: if rawgloss.endswith(x): - rawgloss = rawgloss[:-len(x)] + rawgloss = rawgloss[: -len(x)] break # The gloss could contain templates that produce more list items. 
@@ -1562,10 +1623,10 @@ def extract_link_texts(item): if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()): data_append(sense_base, "raw_glosses", subglosses[1]) m = re.match(r"\(([^()]+)\):?\s*", rawgloss) - # ( ..\1.. ): ... or ( ..\1.. ) ... + # ( ..\1.. ): ... or ( ..\1.. ) ... if m: q = m.group(1) - rawgloss = rawgloss[m.end():].strip() + rawgloss = rawgloss[m.end() :].strip() parse_sense_qualifier(wxr, q, sense_base) if rawgloss == "A pejorative:": data_append(sense_base, "tags", "pejorative") @@ -1583,15 +1644,20 @@ def extract_link_texts(item): # The main recursive call (except for the exceptions at the # start of this function). for sublist in subentries: - if not (isinstance(sublist, WikiNode) and - sublist.kind == NodeKind.LIST): - wxr.wtp.debug(f"'{repr(rawgloss[:20])}.' gloss has `subentries`" - f"with items that are not LISTs", - sortid="page/1511/20230119") + if not ( + isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST + ): + wxr.wtp.debug( + f"'{repr(rawgloss[:20])}.' gloss has `subentries`" + f"with items that are not LISTs", + sortid="page/1511/20230119", + ) continue for item in sublist.children: - if not (isinstance(item, WikiNode) and - item.kind == NodeKind.LIST_ITEM): + if not ( + isinstance(item, WikiNode) + and item.kind == NodeKind.LIST_ITEM + ): continue # copy sense_base to prevent cross-contamination between # subglosses and other subglosses and superglosses @@ -1611,20 +1677,22 @@ def extract_link_texts(item): if added: if examples: # this higher-up gloss has examples that we do not want to skip - wxr.wtp.debug("'{}[...]' gloss has examples we want to keep, " - "but there are subglosses." - .format(repr(rawgloss[:30])), - sortid="page/1498/20230118") + wxr.wtp.debug( + "'{}[...]' gloss has examples we want to keep, " + "but there are subglosses.".format(repr(rawgloss[:30])), + sortid="page/1498/20230118", + ) else: return True # Some entries, e.g., "iacebam", have weird sentences in quotes # after the gloss, but these sentences don't seem to be intended # as glosses. Skip them. - subglosses = list(gl for gl in subglosses - if gl.strip() and - not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', - gl)) + subglosses = list( + gl + for gl in subglosses + if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl) + ) if len(subglosses) > 1 and "form_of" not in sense_base: gl = subglosses[0].strip() @@ -1633,8 +1701,7 @@ def extract_link_texts(item): parsed = parse_alt_or_inflection_of(wxr, gl, gloss_template_args) if parsed is not None: infl_tags, infl_dts = parsed - if (infl_dts and "form-of" in infl_tags and - len(infl_tags) == 1): + if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1: # Interpret others as a particular form under # "inflection of" data_extend(sense_base, "tags", infl_tags) @@ -1680,10 +1747,10 @@ def extract_link_texts(item): sense_data[k] = v # Parse the gloss for this particular sense m = re.match(r"^\((([^()]|\([^()]*\))*)\):?\s*", gloss) - # (...): ... or (...(...)...): ... + # (...): ... or (...(...)...): ... 
if m: parse_sense_qualifier(wxr, m.group(1), sense_data) - gloss = gloss[m.end():].strip() + gloss = gloss[m.end() :].strip() # Remove common suffix "[from 14th c.]" and similar gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss) @@ -1691,12 +1758,15 @@ def extract_link_texts(item): # Check to make sure we don't have unhandled list items in gloss ofs = max(gloss.find("#"), gloss.find("* ")) if ofs > 10 and "(#)" not in gloss: - wxr.wtp.debug("gloss may contain unhandled list items: {}" - .format(gloss), - sortid="page/1412") + wxr.wtp.debug( + "gloss may contain unhandled list items: {}".format(gloss), + sortid="page/1412", + ) elif "\n" in gloss: - wxr.wtp.debug("gloss contains newline: {}".format(gloss), - sortid="page/1416") + wxr.wtp.debug( + "gloss contains newline: {}".format(gloss), + sortid="page/1416", + ) # Kludge, some glosses have a comma after initial qualifiers in # parentheses @@ -1706,7 +1776,7 @@ def extract_link_texts(item): if gloss.endswith(":"): gloss = gloss[:-1].strip() if gloss.startswith("N. of "): - gloss = "Name of " + gloss[6:] + gloss = "Name of " + gloss[6:] if gloss.startswith("†"): data_append(sense_data, "tags", "obsolete") gloss = gloss[1:] @@ -1729,16 +1799,19 @@ def extract_link_texts(item): if tag not in sense_tags: data_append(sense_data, "tags", tag) if countability_tags: - if ("countable" not in sense_tags and - "uncountable" not in sense_tags): + if ( + "countable" not in sense_tags + and "uncountable" not in sense_tags + ): data_extend(sense_data, "tags", countability_tags) # If outer gloss specifies a form-of ("inflection of", see # aquamarine/German), try to parse the inner glosses as # tags for an inflected form. if "form-of" in sense_base.get("tags", ()): - parsed = parse_alt_or_inflection_of(wxr, gloss, - gloss_template_args) + parsed = parse_alt_or_inflection_of( + wxr, gloss, gloss_template_args + ) if parsed is not None: infl_tags, infl_dts = parsed if not infl_dts and infl_tags: @@ -1758,18 +1831,23 @@ def extract_link_texts(item): split_glosses = [] for m in re.finditer(r"Abbreviation of ", gloss): if m.start() != position: - split_glosses.append(gloss[position: m.start()]) + split_glosses.append(gloss[position : m.start()]) position = m.start() split_glosses.append(gloss[position:]) for gloss in split_glosses: # Check if this gloss describes an alt-of or inflection-of - if (lang_code != "en" and " " not in gloss and distw([word], gloss) < 0.3): + if ( + lang_code != "en" + and " " not in gloss + and distw([word], gloss) < 0.3 + ): # Don't try to parse gloss if it is one word # that is close to the word itself for non-English words # (probable translations of a tag/form name) continue - parsed = parse_alt_or_inflection_of(wxr, gloss, - gloss_template_args) + parsed = parse_alt_or_inflection_of( + wxr, gloss, gloss_template_args + ) if parsed is None: continue tags, dts = parsed @@ -1818,8 +1896,9 @@ def parse_inflection(node, section, pos): # print("parse_inflection:", node) if pos is None: - wxr.wtp.debug("inflection table outside part-of-speech", - sortid="page/1812") + wxr.wtp.debug( + "inflection table outside part-of-speech", sortid="page/1812" + ) return def inflection_template_fn(name, ht): @@ -1830,8 +1909,11 @@ def inflection_template_fn(name, ht): # These are not to be captured as an exception to the # generic code below return None - m = re.search(r"-(conj|decl|ndecl|adecl|infl|conjugation|" - r"declension|inflection|mut|mutation)($|-)", name) + m = re.search( + r"-(conj|decl|ndecl|adecl|infl|conjugation|" + 
r"declension|inflection|mut|mutation)($|-)", + name, + ) if m: args_ht = clean_template_args(wxr, ht) dt = {"name": name, "args": args_ht} @@ -1844,7 +1926,7 @@ def inflection_template_fn(name, ht): text = wxr.wtp.node_to_wikitext(node.children) # Split text into separate sections for each to-level template - brace_matches = re.split("({{+|}}+)", text) # ["{{", "template", "}}"] + brace_matches = re.split("({{+|}}+)", text) # ["{{", "template", "}}"] template_sections = [] template_nesting = 0 # depth of SINGLE BRACES { { nesting } } # Because there is the possibility of triple curly braces @@ -1862,14 +1944,13 @@ def inflection_template_fn(name, ht): if len(brace_matches) > 1: tsection = [] after_templates = False # kludge to keep any text - # before first template - # with the first template; - # otherwise, text - # goes with preceding template + # before first template + # with the first template; + # otherwise, text + # goes with preceding template for m in brace_matches: if m.startswith("{{"): - if (template_nesting == 0 and - after_templates): + if template_nesting == 0 and after_templates: template_sections.append(tsection) tsection = [] # start new section @@ -1879,12 +1960,13 @@ def inflection_template_fn(name, ht): elif m.startswith("}}"): template_nesting -= len(m) if template_nesting < 0: - wxr.wtp.error("Negatively nested braces, " - "couldn't split inflection templates, " - "{}/{} section {}" - .format(word, language, section), - sortid="page/1871") - template_sections = [] # use whole text + wxr.wtp.error( + "Negatively nested braces, " + "couldn't split inflection templates, " + "{}/{} section {}".format(word, language, section), + sortid="page/1871", + ) + template_sections = [] # use whole text break tsection.append(m) else: @@ -1904,16 +1986,20 @@ def inflection_template_fn(name, ht): for tsection in template_sections: texts.append("".join(tsection)) if template_nesting != 0: - wxr.wtp.error("Template nesting error: " - "template_nesting = {} " - "couldn't split inflection templates, " - "{}/{} section {}" - .format(template_nesting, word, language, section), - sortid="page/1896") + wxr.wtp.error( + "Template nesting error: " + "template_nesting = {} " + "couldn't split inflection templates, " + "{}/{} section {}".format( + template_nesting, word, language, section + ), + sortid="page/1896", + ) texts = [text] for text in texts: - tree = wxr.wtp.parse(text, expand_all=True, - template_fn=inflection_template_fn) + tree = wxr.wtp.parse( + text, expand_all=True, template_fn=inflection_template_fn + ) # Parse inflection tables from the section. The data is stored # under "forms". 
@@ -1924,10 +2010,16 @@ def inflection_template_fn(name, ht): template_name = m.group(1) tablecontext = TableContext(template_name) - parse_inflection_section(wxr, pos_data, - word, language, - pos, section, tree, - tablecontext=tablecontext) + parse_inflection_section( + wxr, + pos_data, + word, + language, + pos, + section, + tree, + tablecontext=tablecontext, + ) def get_subpage_section(title, subtitle, seq): """Loads a subpage of the given page, and finds the section @@ -1942,9 +2034,11 @@ def get_subpage_section(title, subtitle, seq): subpage_title = word + "/" + subtitle subpage_content = wxr.wtp.get_page_body(subpage_title, 0) if subpage_content is None: - wxr.wtp.error("/translations not found despite " - "{{see translation subpage|...}}", - sortid="page/1934") + wxr.wtp.error( + "/translations not found despite " + "{{see translation subpage|...}}", + sortid="page/1934", + ) def recurse(node, seq): # print(f"seq: {seq}") @@ -1970,14 +2064,17 @@ def recurse(node, seq): subpage_content, pre_expand=True, additional_expand=ADDITIONAL_EXPAND_TEMPLATES, - do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES + do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES, ) assert tree.kind == NodeKind.ROOT ret = recurse(tree, seq) if ret is None: - wxr.wtp.debug("Failed to find subpage section {}/{} seq {}" - .format(title, subtitle, seq), - sortid="page/1963") + wxr.wtp.debug( + "Failed to find subpage section {}/{} seq {}".format( + title, subtitle, seq + ), + sortid="page/1963", + ) return ret def parse_linkage(data, field, linkagenode): @@ -2027,19 +2124,28 @@ def item_recurse(contents, italic=False): sense1 = sense1[:-1].strip() if sense1.startswith("(") and sense1.endswith(")"): sense1 = sense1[1:-1].strip() - if sense1.lower() == wxr.config.OTHER_SUBTITLES["translations"]: + if ( + sense1.lower() + == wxr.config.OTHER_SUBTITLES["translations"] + ): sense1 = None # print("linkage item_recurse LIST sense1:", sense1) - parse_linkage_recurse(node.children, field, - sense=sense1 or sense) + parse_linkage_recurse( + node.children, field, sense=sense1 or sense + ) parts = [] else: parse_linkage_recurse(node.children, field, sense) - elif kind in (NodeKind.TABLE, NodeKind.TABLE_ROW, - NodeKind.TABLE_CELL): + elif kind in ( + NodeKind.TABLE, + NodeKind.TABLE_ROW, + NodeKind.TABLE_CELL, + ): parse_linkage_recurse(node.children, field, sense) - elif kind in (NodeKind.TABLE_HEADER_CELL, - NodeKind.TABLE_CAPTION): + elif kind in ( + NodeKind.TABLE_HEADER_CELL, + NodeKind.TABLE_CAPTION, + ): continue elif kind == NodeKind.HTML: classes = (node.attrs.get("class") or "").split() @@ -2066,17 +2172,19 @@ def item_recurse(contents, italic=False): ignore = False if isinstance(node.largs[0][0], str): v = node.largs[0][0].strip().lower() - if v.startswith(ns_title_prefix_tuple(wxr, - "Category", True) \ - + ns_title_prefix_tuple(wxr, - "File", True)): + if v.startswith( + ns_title_prefix_tuple(wxr, "Category", True) + + ns_title_prefix_tuple(wxr, "File", True) + ): ignore = True if not ignore: v = node.largs[-1] - if (len(node.largs) == 1 and - len(v) > 0 and - isinstance(v[0], str) and - v[0][0] == ":"): + if ( + len(node.largs) == 1 + and len(v) > 0 + and isinstance(v[0], str) + and v[0][0] == ":" + ): v = [v[0][1:]] + list(v[1:]) item_recurse(v, italic=italic) elif kind == NodeKind.URL: @@ -2093,9 +2201,12 @@ def item_recurse(contents, italic=False): elif kind in (NodeKind.PREFORMATTED, NodeKind.BOLD): item_recurse(node.children, italic=italic) else: - wxr.wtp.debug("linkage item_recurse unhandled {}: {}" - 
.format(node.kind, node), - sortid="page/2073") + wxr.wtp.debug( + "linkage item_recurse unhandled {}: {}".format( + node.kind, node + ), + sortid="page/2073", + ) # print("LINKAGE CONTENTS BEFORE ITEM_RECURSE: {!r}" # .format(contents)) @@ -2105,9 +2216,18 @@ def item_recurse(contents, italic=False): # print("CLEANED ITEM: {!r}".format(item)) # print(f"URLS {urls=!r}") - return parse_linkage_item_text(wxr, word, data, field, item, - sense, ruby, pos_datas, - is_reconstruction, urls) + return parse_linkage_item_text( + wxr, + word, + data, + field, + item, + sense, + ruby, + pos_datas, + is_reconstruction, + urls, + ) def parse_linkage_template(node): nonlocal have_panel_template @@ -2123,11 +2243,15 @@ def linkage_template_fn(name, ht): have_panel_template = True return "" for prefix, t in template_linkage_mappings: - if re.search(r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), - name): + if re.search( + r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name + ): f = t if isinstance(t, str) else field - if (name.endswith("-top") or name.endswith("-bottom") or - name.endswith("-mid")): + if ( + name.endswith("-top") + or name.endswith("-bottom") + or name.endswith("-mid") + ): field = f return "" i = t if isinstance(t, int) else 2 @@ -2144,8 +2268,9 @@ def linkage_template_fn(name, ht): # Main body of parse_linkage_template() text = wxr.wtp.node_to_wikitext(node) - parsed = wxr.wtp.parse(text, expand_all=True, - template_fn=linkage_template_fn) + parsed = wxr.wtp.parse( + text, expand_all=True, template_fn=linkage_template_fn + ) parse_linkage_recurse(parsed.children, field, None) def parse_linkage_recurse(contents, field, sense): @@ -2177,9 +2302,12 @@ def parse_linkage_recurse(contents, field, sense): parse_linkage_recurse(node.children, field, sense) elif kind == NodeKind.TABLE_CELL: parse_linkage_item(node.children, field, sense) - elif kind in (NodeKind.TABLE_CAPTION, - NodeKind.TABLE_HEADER_CELL, - NodeKind.PREFORMATTED, NodeKind.BOLD): + elif kind in ( + NodeKind.TABLE_CAPTION, + NodeKind.TABLE_HEADER_CELL, + NodeKind.PREFORMATTED, + NodeKind.BOLD, + ): continue elif kind == NodeKind.HTML: # Recurse to process inside the HTML for most tags @@ -2196,16 +2324,18 @@ def parse_linkage_recurse(contents, field, sense): if sense1.endswith(":"): sense1 = sense1[:-1].strip() if sense and sense1: - wxr.wtp.debug("linkage qualifier-content on multiple " - "levels: {!r} and {!r}" - .format(sense, sense1), - sortid="page/2170") + wxr.wtp.debug( + "linkage qualifier-content on multiple " + "levels: {!r} and {!r}".format(sense, sense1), + sortid="page/2170", + ) parse_linkage_recurse(node.children, field, sense1) elif "NavFrame" in classes: # NavFrame uses previously assigned next_navframe_sense # (from a "(sense):" item) and clears it afterwards - parse_linkage_recurse(node.children, field, - sense or next_navframe_sense) + parse_linkage_recurse( + node.children, field, sense or next_navframe_sense + ) next_navframe_sense = None else: parse_linkage_recurse(node.children, field, sense) @@ -2222,9 +2352,12 @@ def parse_linkage_recurse(contents, field, sense): # initial value parse_linkage_recurse(node.largs[-1], field, sense) else: - wxr.wtp.debug("parse_linkage_recurse unhandled {}: {}" - .format(kind, node), - sortid="page/2196") + wxr.wtp.debug( + "parse_linkage_recurse unhandled {}: {}".format( + kind, node + ), + sortid="page/2196", + ) def linkage_template_fn1(name, ht): nonlocal have_panel_template @@ -2239,10 +2372,14 @@ def parse_zh_synonyms(parsed, data, hdrs, root_word): if isinstance(item, 
WikiNode): if item.kind == NodeKind.TABLE_ROW: cleaned = clean_node(wxr, None, item.children) - #print("cleaned:", repr(cleaned)) - if any(["Variety" in cleaned, - "Location" in cleaned, - "Words" in cleaned]): + # print("cleaned:", repr(cleaned)) + if any( + [ + "Variety" in cleaned, + "Location" in cleaned, + "Words" in cleaned, + ] + ): pass else: split = cleaned.split("\n") @@ -2268,11 +2405,15 @@ def parse_zh_synonyms(parsed, data, hdrs, root_word): if tag in zh_tag_lookup: tags.extend(zh_tag_lookup[tag]) else: - print(f"MISSING ZH SYNONYM TAG for root {root_word}, word {words}: {tag}") + print( + f"MISSING ZH SYNONYM TAG for root {root_word}, word {words}: {tag}" + ) sys.stdout.flush() for word in words: - data.append({"word": word.strip(), "tags": tags}) + data.append( + {"word": word.strip(), "tags": tags} + ) elif item.kind == NodeKind.HTML: cleaned = clean_node(wxr, None, item.children) if "Synonyms of" in cleaned: @@ -2288,10 +2429,14 @@ def parse_zh_synonyms_list(parsed, data, hdrs, root_word): if isinstance(item, WikiNode): if item.kind == NodeKind.LIST_ITEM: cleaned = clean_node(wxr, None, item.children) - #print("cleaned:", repr(cleaned)) - if any(["Variety" in cleaned, - "Location" in cleaned, - "Words" in cleaned]): + # print("cleaned:", repr(cleaned)) + if any( + [ + "Variety" in cleaned, + "Location" in cleaned, + "Words" in cleaned, + ] + ): pass else: cleaned = cleaned.replace("(", ",") @@ -2309,11 +2454,15 @@ def parse_zh_synonyms_list(parsed, data, hdrs, root_word): tags.append(tag) elif tag in zh_tag_lookup: tags.extend(zh_tag_lookup[tag]) - elif classify_desc(tag) == "romanization" \ - and roman is None: + elif ( + classify_desc(tag) == "romanization" + and roman is None + ): roman = tag else: - print(f"MISSING ZH SYNONYM TAG (possibly pinyin) - root {root_word}, word {words}: {tag}") + print( + f"MISSING ZH SYNONYM TAG (possibly pinyin) - root {root_word}, word {words}: {tag}" + ) sys.stdout.flush() for word in words: @@ -2328,9 +2477,13 @@ def parse_zh_synonyms_list(parsed, data, hdrs, root_word): if cleaned.find("Synonyms of") >= 0: cleaned = cleaned.replace("Synonyms of ", "") root_word = cleaned - parse_zh_synonyms_list(item.children, data, hdrs, root_word) + parse_zh_synonyms_list( + item.children, data, hdrs, root_word + ) else: - parse_zh_synonyms_list(item.children, data, hdrs, root_word) + parse_zh_synonyms_list( + item.children, data, hdrs, root_word + ) def contains_kind(children, nodekind): assert isinstance(children, list) @@ -2345,21 +2498,21 @@ def contains_kind(children, nodekind): # Main body of parse_linkage() text = wxr.wtp.node_to_wikitext(linkagenode.children) - parsed = wxr.wtp.parse(text, expand_all=True, - template_fn=linkage_template_fn1) + parsed = wxr.wtp.parse( + text, expand_all=True, template_fn=linkage_template_fn1 + ) if field == "synonyms" and lang_code == "zh": synonyms = [] if contains_kind(parsed.children, NodeKind.LIST): parse_zh_synonyms_list(parsed.children, synonyms, [], "") else: parse_zh_synonyms(parsed.children, synonyms, [], "") - #print(json.dumps(synonyms, indent=4, ensure_ascii=False)) + # print(json.dumps(synonyms, indent=4, ensure_ascii=False)) data_extend(data, "synonyms", synonyms) parse_linkage_recurse(parsed.children, field, None) if not data.get(field) and not have_panel_template: text = "".join(toplevel_text).strip() - if ("\n" not in text and "," in text and - text.count(",") > 3): + if "\n" not in text and "," in text and text.count(",") > 3: if not text.startswith("See "): parse_linkage_item([text], field, 
None) @@ -2388,8 +2541,10 @@ def parse_translation_item(contents, lang=None): # print("sense <- clean_node: ", sense) idx = sense.find("See also translations at") if idx > 0: - wxr.wtp.debug("Skipping translation see also: {}".format(sense), - sortid="page/2361") + wxr.wtp.debug( + "Skipping translation see also: {}".format(sense), + sortid="page/2361", + ) sense = sense[:idx].strip() if sense.endswith(":"): sense = sense[:-1].strip() @@ -2412,10 +2567,13 @@ def translation_item_template_fn(name, ht): code = ht.get(1) if code: if langcode and code != langcode: - wxr.wtp.debug("inconsistent language codes {} vs " - "{} in translation item: {!r} {}" - .format(langcode, code, name, ht), - sortid="page/2386") + wxr.wtp.debug( + "inconsistent language codes {} vs " + "{} in translation item: {!r} {}".format( + langcode, code, name, ht + ), + sortid="page/2386", + ) langcode = code tr = ht.get(2) if tr: @@ -2431,8 +2589,9 @@ def translation_item_template_fn(name, ht): langcode = code return None if name == "trans-see": - wxr.wtp.error("UNIMPLEMENTED trans-see template", - sortid="page/2405") + wxr.wtp.error( + "UNIMPLEMENTED trans-see template", sortid="page/2405" + ) return "" if name.endswith("-top"): return "" @@ -2440,28 +2599,41 @@ def translation_item_template_fn(name, ht): return "" if name.endswith("-mid"): return "" - #wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}" + # wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}" # .format(name), # sortid="page/2414") return None - sublists = list(x for x in contents - if isinstance(x, WikiNode) and - x.kind == NodeKind.LIST) - contents = list(x for x in contents - if not isinstance(x, WikiNode) or - x.kind != NodeKind.LIST) + sublists = list( + x + for x in contents + if isinstance(x, WikiNode) and x.kind == NodeKind.LIST + ) + contents = list( + x + for x in contents + if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST + ) - item = clean_node(wxr, data, contents, - template_fn=translation_item_template_fn) + item = clean_node( + wxr, data, contents, template_fn=translation_item_template_fn + ) # print(" TRANSLATION ITEM: {!r} [{}]".format(item, sense)) # Parse the translation item. if item: - lang = parse_translation_item_text(wxr, word, data, item, sense, - pos_datas, lang, langcode, - translations_from_template, - is_reconstruction) + lang = parse_translation_item_text( + wxr, + word, + data, + item, + sense, + pos_datas, + lang, + langcode, + translations_from_template, + is_reconstruction, + ) # Handle sublists. They are frequently used for different scripts # for the language and different variants of the language. 
We will @@ -2495,8 +2667,9 @@ def template_fn(name, ht): sense = None sub = ht.get(1, "") if sub: - m = re.match(r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", - sub) + m = re.match( + r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub + ) else: m = None etym = "" @@ -2507,51 +2680,83 @@ def template_fn(name, ht): etym = m.group(2) pos = m.group(3) if not sub: - wxr.wtp.debug("no part-of-speech in " - "{{see translation subpage|...}}, " - "defaulting to just wxr.wtp.section " - "(= language)", - sortid="page/2468") + wxr.wtp.debug( + "no part-of-speech in " + "{{see translation subpage|...}}, " + "defaulting to just wxr.wtp.section " + "(= language)", + sortid="page/2468", + ) # seq sent to get_subpage_section without sub and pos - seq = [language, wxr.config.OTHER_SUBTITLES["translations"]] - elif (m and etym.lower().strip() - in wxr.config.OTHER_SUBTITLES["etymology"] - and pos.lower() in wxr.config.POS_SUBTITLES): - seq = [language, - etym_numbered, - pos, - wxr.config.OTHER_SUBTITLES["translations"]] + seq = [ + language, + wxr.config.OTHER_SUBTITLES["translations"], + ] + elif ( + m + and etym.lower().strip() + in wxr.config.OTHER_SUBTITLES["etymology"] + and pos.lower() in wxr.config.POS_SUBTITLES + ): + seq = [ + language, + etym_numbered, + pos, + wxr.config.OTHER_SUBTITLES["translations"], + ] elif sub.lower() in wxr.config.POS_SUBTITLES: # seq with sub but not pos - seq = [language, - sub, - wxr.config.OTHER_SUBTITLES["translations"]] + seq = [ + language, + sub, + wxr.config.OTHER_SUBTITLES["translations"], + ] else: # seq with sub and pos pos = wxr.wtp.subsection if pos.lower() not in wxr.config.POS_SUBTITLES: - wxr.wtp.debug("unhandled see translation subpage: " - "language={} sub={} wxr.wtp.subsection={}" - .format(language, sub, wxr.wtp.subsection), - sortid="page/2478") - seq = [language, - sub, - pos, - wxr.config.OTHER_SUBTITLES["translations"]] + wxr.wtp.debug( + "unhandled see translation subpage: " + "language={} sub={} wxr.wtp.subsection={}".format( + language, sub, wxr.wtp.subsection + ), + sortid="page/2478", + ) + seq = [ + language, + sub, + pos, + wxr.config.OTHER_SUBTITLES["translations"], + ] subnode = get_subpage_section( - wxr.wtp.title, wxr.config.OTHER_SUBTITLES["translations"], seq) + wxr.wtp.title, + wxr.config.OTHER_SUBTITLES["translations"], + seq, + ) if subnode is not None: parse_translations(data, subnode) else: # Failed to find the normal subpage section seq = [wxr.config.OTHER_SUBTITLES["translations"]] subnode = get_subpage_section( - wxr.wtp.title, wxr.config.OTHER_SUBTITLES["translations"], seq) + wxr.wtp.title, + wxr.config.OTHER_SUBTITLES["translations"], + seq, + ) if subnode is not None: parse_translations(data, subnode) return "" - if name in ("c", "C", "categorize", "cat", "catlangname", - "topics", "top", "qualifier", "cln"): + if name in ( + "c", + "C", + "categorize", + "cat", + "catlangname", + "topics", + "top", + "qualifier", + "cln", + ): # These are expanded in the default way return None if name in ("trans-top",): @@ -2564,8 +2769,12 @@ def template_fn(name, ht): sense_parts = [] sense = None return None - if name in ("trans-bottom", "trans-mid", - "checktrans-mid", "checktrans-bottom"): + if name in ( + "trans-bottom", + "trans-mid", + "checktrans-mid", + "checktrans-bottom", + ): return None if name == "checktrans-top": sense_parts = [] @@ -2576,11 +2785,17 @@ def template_fn(name, ht): sense_parts = [] sense = None return "" - wxr.wtp.error("UNIMPLEMENTED parse_translation_template: {} {}" - .format(name, ht), - sortid="page/2517") + 
wxr.wtp.error( + "UNIMPLEMENTED parse_translation_template: {} {}".format( + name, ht + ), + sortid="page/2517", + ) return "" - wxr.wtp.expand(wxr.wtp.node_to_wikitext(node), template_fn=template_fn) + + wxr.wtp.expand( + wxr.wtp.node_to_wikitext(node), template_fn=template_fn + ) def parse_translation_recurse(xlatnode): nonlocal sense @@ -2590,9 +2805,11 @@ def parse_translation_recurse(xlatnode): if isinstance(node, str): if sense: if not node.isspace(): - wxr.wtp.debug("skipping string in the middle of " - "translations: {}".format(node), - sortid="page/2530") + wxr.wtp.debug( + "skipping string in the middle of " + "translations: {}".format(node), + sortid="page/2530", + ) continue # Add a part to the sense sense_parts.append(node) @@ -2616,8 +2833,11 @@ def parse_translation_recurse(xlatnode): pass elif kind == NodeKind.TEMPLATE: parse_translation_template(node) - elif kind in (NodeKind.TABLE, NodeKind.TABLE_ROW, - NodeKind.TABLE_CELL): + elif kind in ( + NodeKind.TABLE, + NodeKind.TABLE_ROW, + NodeKind.TABLE_CELL, + ): parse_translation_recurse(node) elif kind == NodeKind.HTML: if node.attrs.get("class") == "NavFrame": @@ -2636,8 +2856,7 @@ def parse_translation_recurse(xlatnode): elif kind in LEVEL_KINDS: # Sub-levels will be recursed elsewhere pass - elif kind in (NodeKind.ITALIC, - NodeKind.BOLD): + elif kind in (NodeKind.ITALIC, NodeKind.BOLD): parse_translation_recurse(node) elif kind == NodeKind.PREFORMATTED: print("parse_translation_recurse: PREFORMATTED:", node) @@ -2650,29 +2869,53 @@ def parse_translation_recurse(xlatnode): # handle them. Note: must be careful not to read other # links, particularly things like in "human being": # "a human being -- see [[man/translations]]" (group title) - if (isinstance(arg0, (list, tuple)) and - arg0 and - isinstance(arg0[0], str) and - arg0[0].endswith("/" + wxr.config.OTHER_SUBTITLES["translations"]) and - arg0[0][:-(1 + len(wxr.config.OTHER_SUBTITLES["translations"]))] == wxr.wtp.title): - wxr.wtp.debug("translations subpage link found on main " - "page instead " - "of normal {{see translation subpage|...}}", - sortid="page/2595") + if ( + isinstance(arg0, (list, tuple)) + and arg0 + and isinstance(arg0[0], str) + and arg0[0].endswith( + "/" + wxr.config.OTHER_SUBTITLES["translations"] + ) + and arg0[0][ + : -( + 1 + + len( + wxr.config.OTHER_SUBTITLES["translations"] + ) + ) + ] + == wxr.wtp.title + ): + wxr.wtp.debug( + "translations subpage link found on main " + "page instead " + "of normal {{see translation subpage|...}}", + sortid="page/2595", + ) sub = wxr.wtp.subsection if sub.lower() in wxr.config.POS_SUBTITLES: - seq = [language, sub, wxr.config.OTHER_SUBTITLES["translations"]] + seq = [ + language, + sub, + wxr.config.OTHER_SUBTITLES["translations"], + ] subnode = get_subpage_section( - wxr.wtp.title, wxr.config.OTHER_SUBTITLES["translations"], seq) + wxr.wtp.title, + wxr.config.OTHER_SUBTITLES["translations"], + seq, + ) if subnode is not None: parse_translations(data, subnode) else: - wxr.wtp.errors("/translations link outside " - "part-of-speech") + wxr.wtp.errors( + "/translations link outside " "part-of-speech" + ) - if (len(arg0) >= 1 and - isinstance(arg0[0], str) and - not arg0[0].lower().startswith("category:")): + if ( + len(arg0) >= 1 + and isinstance(arg0[0], str) + and not arg0[0].lower().startswith("category:") + ): for x in node.largs[-1]: if isinstance(x, str): sense_parts.append(x) @@ -2681,9 +2924,11 @@ def parse_translation_recurse(xlatnode): elif not sense: sense_parts.append(node) else: - 
wxr.wtp.debug("skipping text between translation items/senses: " - "{}".format(node), - sortid="page/2621") + wxr.wtp.debug( + "skipping text between translation items/senses: " + "{}".format(node), + sortid="page/2621", + ) # Main code of parse_translation(). We want ``sense`` to be assigned # regardless of recursion levels, and thus the code is structured @@ -2720,17 +2965,25 @@ def etym_post_template_fn(name, ht, expansion): if ignore_count == 0: ht = clean_template_args(wxr, ht) expansion = clean_node(wxr, None, expansion) - templates.append({"name": name, "args": ht, "expansion": expansion}) + templates.append( + {"name": name, "args": ht, "expansion": expansion} + ) return None # Remove any subsections - contents = list(x for x in node.children - if not isinstance(x, WikiNode) or - x.kind not in LEVEL_KINDS) + contents = list( + x + for x in node.children + if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS + ) # Convert to text, also capturing templates using post_template_fn - text = clean_node(wxr, None, contents, - template_fn=etym_template_fn, - post_template_fn=etym_post_template_fn) + text = clean_node( + wxr, + None, + contents, + template_fn=etym_template_fn, + post_template_fn=etym_post_template_fn, + ) # Save the collected information. data["etymology_text"] = text data["etymology_templates"] = templates @@ -2804,20 +3057,23 @@ def desc_post_template_fn(name, ht, expansion): # same proto-language, then we tag this descendant entry with # "derived" is_derived = ( - is_proto_root_derived_section and - (name == "l" or name == "link") and - ("1" in ht and ht["1"] == lang_code) + is_proto_root_derived_section + and (name == "l" or name == "link") + and ("1" in ht and ht["1"] == lang_code) ) expansion = clean_node(wxr, None, expansion) - templates.append({ - "name": name, "args": ht, "expansion": expansion - }) + templates.append( + {"name": name, "args": ht, "expansion": expansion} + ) return None - text = clean_node(wxr, None, children, - template_fn=desc_template_fn, - post_template_fn=desc_post_template_fn - ) + text = clean_node( + wxr, + None, + children, + template_fn=desc_template_fn, + post_template_fn=desc_post_template_fn, + ) item_data["templates"] = templates item_data["text"] = text if is_derived: @@ -2837,11 +3093,15 @@ def get_sublist_index(list_item): def get_descendants(node): """Appends the data for every list item in every list in node - to descendants.""" + to descendants.""" for _, c in node_children(node): - if (c.kind == NodeKind.TEMPLATE and c.largs - and len(c.largs[0]) == 1 and isinstance(c.largs[0][0], str) - and c.largs[0][0] in unignored_non_list_templates): + if ( + c.kind == NodeKind.TEMPLATE + and c.largs + and len(c.largs[0]) == 1 + and isinstance(c.largs[0][0], str) + and c.largs[0][0] in unignored_non_list_templates + ): # Some Descendants sections have no wikitext list. Rather, # the list is entirely generated by a single template (see # e.g. the use of {{CJKV}} in Chinese entries). @@ -2914,40 +3174,48 @@ def skip_template_fn(name, ht): if node.kind not in LEVEL_KINDS: # XXX handle e.g. 
wikipedia links at the top of a language # XXX should at least capture "also" at top of page - if node.kind in (NodeKind.HLINE, NodeKind.LIST, - NodeKind.LIST_ITEM): + if node.kind in ( + NodeKind.HLINE, + NodeKind.LIST, + NodeKind.LIST_ITEM, + ): continue # print(" UNEXPECTED: {}".format(node)) # Clean the node to collect category links - clean_node(wxr, etym_data, node, - template_fn=skip_template_fn) + clean_node(wxr, etym_data, node, template_fn=skip_template_fn) continue - t = clean_node(wxr, etym_data, - node.sarg if node.sarg else node.largs) + t = clean_node( + wxr, etym_data, node.sarg if node.sarg else node.largs + ) t = t.lower() # XXX these counts were never implemented fully, and even this # gets discarded: Search STATISTICS_IMPLEMENTATION wxr.config.section_counts[t] += 1 # print("PROCESS_CHILDREN: T:", repr(t)) if t.startswith(tuple(wxr.config.OTHER_SUBTITLES["pronunciation"])): - if t.startswith(tuple( + if t.startswith( + tuple( pron_title + " " - for pron_title in - wxr.config.OTHER_SUBTITLES.get("pronunciation", []))): + for pron_title in wxr.config.OTHER_SUBTITLES.get( + "pronunciation", [] + ) + ) + ): # Pronunciation 1, etc, are used in Chinese Glyphs, # and each of them may have senses under Definition push_etym() wxr.wtp.start_subsection(None) if wxr.config.capture_pronunciation: data = select_data() - parse_pronunciation(wxr, - node, - data, - etym_data, - have_etym, - base_data, - lang_code, - ) + parse_pronunciation( + wxr, + node, + data, + etym_data, + have_etym, + base_data, + lang_code, + ) elif t.startswith(tuple(wxr.config.OTHER_SUBTITLES["etymology"])): push_etym() wxr.wtp.start_subsection(None) @@ -2963,11 +3231,13 @@ def skip_template_fn(name, ht): data = select_data() parse_descendants(data, node) elif ( - t in wxr.config.OTHER_SUBTITLES.get( + t + in wxr.config.OTHER_SUBTITLES.get( "proto_root_derived_sections", [] ) - and pos == "root" and is_reconstruction and - wxr.config.capture_descendants + and pos == "root" + and is_reconstruction + and wxr.config.capture_descendants ): data = select_data() parse_descendants(data, node, True) @@ -2989,17 +3259,20 @@ def skip_template_fn(name, ht): pos = dt["pos"] wxr.wtp.start_subsection(t) if "debug" in dt: - wxr.wtp.debug("{} in section {}" - .format(dt["debug"], t), - sortid="page/2755") + wxr.wtp.debug( + "{} in section {}".format(dt["debug"], t), + sortid="page/2755", + ) if "warning" in dt: - wxr.wtp.warning("{} in section {}" - .format(dt["warning"], t), - sortid="page/2759") + wxr.wtp.warning( + "{} in section {}".format(dt["warning"], t), + sortid="page/2759", + ) if "error" in dt: - wxr.wtp.error("{} in section {}" - .format(dt["error"], t), - sortid="page/2763") + wxr.wtp.error( + "{} in section {}".format(dt["error"], t), + sortid="page/2763", + ) # Parse word senses for the part-of-speech parse_part_of_speech(node, pos) if "tags" in dt: @@ -3057,9 +3330,9 @@ def usex_template_fn(name, ht): elif name in quotation_templates: usex_type = "quotation" for prefix, t in template_linkage_mappings: - if re.search(r"(^|[-/\s]){}($|\b|[0-9])" - .format(prefix), - name): + if re.search( + r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name + ): return "" return None @@ -3068,23 +3341,32 @@ def usex_template_fn(name, ht): contents = item.children if lang_code == "ja": # print(contents) - if (contents and isinstance(contents, str) and - re.match(r"\s*$", contents[0])): + if ( + contents + and isinstance(contents, str) + and re.match(r"\s*$", contents[0]) + ): contents = contents[1:] - exp = 
wxr.wtp.parse(wxr.wtp.node_to_wikitext(contents), - # post_template_fn=head_post_template_fn, - expand_all=True) + exp = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(contents), + # post_template_fn=head_post_template_fn, + expand_all=True, + ) rub, rest = extract_ruby(wxr, exp.children) if rub: for r in rub: ruby.append(r) contents = rest - subtext = clean_node(wxr, sense_base, contents, - template_fn=usex_template_fn) - subtext = re.sub(r"\s*\(please add an English " - r"translation of this " - r"(example|usage example|quote)\)", - "", subtext).strip() + subtext = clean_node( + wxr, sense_base, contents, template_fn=usex_template_fn + ) + subtext = re.sub( + r"\s*\(please add an English " + r"translation of this " + r"(example|usage example|quote)\)", + "", + subtext, + ).strip() subtext = re.sub(r"\^\([^)]*\)", "", subtext) subtext = re.sub(r"\s*[―—]+$", "", subtext) # print("subtext:", repr(subtext)) @@ -3093,17 +3375,21 @@ def usex_template_fn(name, ht): # print(lines) lines = list(re.sub(r"^[#:*]*", "", x).strip() for x in lines) - lines = list(x for x in lines - if not re.match( - r"(Synonyms: |Antonyms: |Hyponyms: |" - r"Synonym: |Antonym: |Hyponym: |" - r"Hypernyms: |Derived terms: |" - r"Related terms: |" - r"Hypernym: |Derived term: |" - r"Coordinate terms:|" - r"Related term: |" - r"For more quotations using )", - x)) + lines = list( + x + for x in lines + if not re.match( + r"(Synonyms: |Antonyms: |Hyponyms: |" + r"Synonym: |Antonym: |Hyponym: |" + r"Hypernyms: |Derived terms: |" + r"Related terms: |" + r"Hypernym: |Derived term: |" + r"Coordinate terms:|" + r"Related term: |" + r"For more quotations using )", + x, + ) + ) tr = "" ref = "" roman = "" @@ -3112,26 +3398,28 @@ def usex_template_fn(name, ht): # print(classify_desc(line)) if len(lines) == 1 and lang_code != "en": parts = re.split(r"\s*[―—]+\s*", lines[0]) - if (len(parts) == 2 and - classify_desc(parts[1]) == "english"): + if len(parts) == 2 and classify_desc(parts[1]) == "english": lines = [parts[0].strip()] tr = parts[1].strip() - elif (len(parts) == 3 and - classify_desc(parts[1]) in ("romanization", - "english") and - classify_desc(parts[2]) == "english"): + elif ( + len(parts) == 3 + and classify_desc(parts[1]) + in ("romanization", "english") + and classify_desc(parts[2]) == "english" + ): lines = [parts[0].strip()] roman = parts[1].strip() tr = parts[2].strip() else: parts = re.split(r"\s+-\s+", lines[0]) - if (len(parts) == 2 and - classify_desc(parts[1]) == "english"): + if ( + len(parts) == 2 + and classify_desc(parts[1]) == "english" + ): lines = [parts[0].strip()] tr = parts[1].strip() elif len(lines) > 1: - if any(re.search(r"[]\d:)]\s*$", x) - for x in lines[:-1]): + if any(re.search(r"[]\d:)]\s*$", x) for x in lines[:-1]): ref = [] for i in range(len(lines)): if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]): @@ -3140,13 +3428,17 @@ def usex_template_fn(name, ht): if re.search(r"[]\d:)]\s*$", lines[i]): break ref = " ".join(ref) - lines = lines[i + 1:] - if (lang_code != "en" and len(lines) >= 2 and - classify_desc(lines[-1]) == "english"): + lines = lines[i + 1 :] + if ( + lang_code != "en" + and len(lines) >= 2 + and classify_desc(lines[-1]) == "english" + ): i = len(lines) - 1 - while (i > 1 and - classify_desc(lines[i - 1]) - == "english"): + while ( + i > 1 + and classify_desc(lines[i - 1]) == "english" + ): i -= 1 tr = "\n".join(lines[i:]) lines = lines[:i] @@ -3155,8 +3447,7 @@ def usex_template_fn(name, ht): roman = lines[-1].strip() lines = lines[:-1] - elif (lang_code == "en" and - 
re.match(r"^[#*]*:+", lines[1])): + elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]): ref = lines[0] lines = lines[1:] elif lang_code != "en" and len(lines) == 2: @@ -3168,9 +3459,13 @@ def usex_template_fn(name, ht): elif cls1 == "english" and cls2 != "english": tr = lines[0] lines = [lines[1]] - elif (re.match(r"^[#*]*:+", lines[1]) and - classify_desc(re.sub(r"^[#*:]+\s*", "", - lines[1])) == "english"): + elif ( + re.match(r"^[#*]*:+", lines[1]) + and classify_desc( + re.sub(r"^[#*:]+\s*", "", lines[1]) + ) + == "english" + ): tr = re.sub(r"^[#*:]+\s*", "", lines[1]) lines = [lines[0]] elif cls1 == "english" and cls2 == "english": @@ -3179,20 +3474,27 @@ def usex_template_fn(name, ht): # non-English, as that seems more common. tr = lines[1] lines = [lines[0]] - elif (usex_type != "quotation" and - lang_code != "en" and - len(lines) == 3): + elif ( + usex_type != "quotation" + and lang_code != "en" + and len(lines) == 3 + ): cls1 = classify_desc(lines[0]) cls2 = classify_desc(lines[1]) cls3 = classify_desc(lines[2]) - if (cls3 == "english" and - cls2 in ["english", "romanization"] and - cls1 != "english"): + if ( + cls3 == "english" + and cls2 in ["english", "romanization"] + and cls1 != "english" + ): tr = lines[2].strip() roman = lines[1].strip() lines = [lines[0].strip()] - elif (usex_type == "quotation" and - lang_code != "en" and len(lines) > 2): + elif ( + usex_type == "quotation" + and lang_code != "en" + and len(lines) > 2 + ): # for x in lines: # print(" LINE: {}: {}" # .format(classify_desc(x), x)) @@ -3202,9 +3504,10 @@ def usex_template_fn(name, ht): cls1 = classify_desc(lines[-1]) if cls1 == "english": i = len(lines) - 1 - while (i > 1 and - classify_desc(lines[i - 1]) - == "english"): + while ( + i > 1 + and classify_desc(lines[i - 1]) == "english" + ): i -= 1 tr = "\n".join(lines[i:]) lines = lines[:i] @@ -3215,10 +3518,13 @@ def usex_template_fn(name, ht): tr = re.sub(r"[ \t\r]+", " ", tr).strip() tr = re.sub(r"\[\s*…\s*\]", "[…]", tr) ref = re.sub(r"^[#*:]+\s*", "", ref) - ref = re.sub(r", (volume |number |page )?“?" - r"\(please specify ([^)]|\(s\))*\)”?|" - ", text here$", - "", ref) + ref = re.sub( + r", (volume |number |page )?“?" + r"\(please specify ([^)]|\(s\))*\)”?|" + ", text here$", + "", + ref, + ) ref = re.sub(r"\[\s*…\s*\]", "[…]", ref) lines = list(re.sub(r"^[#*:]+\s*", "", x) for x in lines) subtext = "\n".join(x for x in lines if x) @@ -3226,30 +3532,41 @@ def usex_template_fn(name, ht): m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext) if m and classify_desc(m.group(2)) == "english": tr = m.group(2) - subtext = subtext[:m.start()] + m.group(1) + subtext = subtext[: m.start()] + m.group(1) elif lines: parts = re.split(r"\s*[―—]+\s*", lines[0]) - if (len(parts) == 2 and - classify_desc(parts[1]) == "english"): + if ( + len(parts) == 2 + and classify_desc(parts[1]) == "english" + ): subtext = parts[0].strip() tr = parts[1].strip() - subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", - subtext) - subtext = re.sub(r"(please add an English translation of " - r"this (quote|usage example))", - "", subtext) - subtext = re.sub(r"\s*→New International Version " - "translation$", - "", subtext) # e.g. pis/Tok Pisin (Bible) + subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext) + subtext = re.sub( + r"(please add an English translation of " + r"this (quote|usage example))", + "", + subtext, + ) + subtext = re.sub( + r"\s*→New International Version " "translation$", + "", + subtext, + ) # e.g. 
pis/Tok Pisin (Bible) subtext = re.sub(r"[ \t\r]+", " ", subtext).strip() subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext) note = None m = re.match(r"^\(([^)]*)\):\s+", subtext) - if (m is not None and lang_code != "en" and - (m.group(1).startswith("with ") or - classify_desc(m.group(1)) == "english")): + if ( + m is not None + and lang_code != "en" + and ( + m.group(1).startswith("with ") + or classify_desc(m.group(1)) == "english" + ) + ): note = m.group(1) - subtext = subtext[m.end():] + subtext = subtext[m.end() :] ref = re.sub(r"\s*\(→ISBN\)", "", ref) ref = re.sub(r",\s*→ISBN", "", ref) ref = ref.strip() @@ -3278,7 +3595,6 @@ def usex_template_fn(name, ht): return examples - # Main code of parse_language() # Process the section stack.append(language) @@ -3358,9 +3674,10 @@ def top_template_fn(name, ht): if arg.startswith("Q") or arg.startswith("Lexeme:L"): data_append(data, "wikidata", arg) return "" - wxr.wtp.debug("UNIMPLEMENTED top-level template: {} {}" - .format(name, ht), - sortid="page/2870") + wxr.wtp.debug( + "UNIMPLEMENTED top-level template: {} {}".format(name, ht), + sortid="page/2870", + ) return "" clean_node(wxr, None, [node], template_fn=top_template_fn) @@ -3373,9 +3690,9 @@ def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str: # Known lowercase PoS names are in part_of_speech_map # Known lowercase linkage section names are in linkage_map - old = re.split(r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" - r"[ \t]*(==+)[ \t]*$", - text) + old = re.split( + r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" r"[ \t]*(==+)[ \t]*$", text + ) parts = [] npar = 4 # Number of parentheses in above expression @@ -3389,22 +3706,29 @@ def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str: level = len(left) part = old[i + npar] if level != len(right): - wxr.wtp.debug("subtitle has unbalanced levels: " - "{!r} has {} on the left and {} on the right" - .format(title, left, right), - sortid="page/2904") + wxr.wtp.debug( + "subtitle has unbalanced levels: " + "{!r} has {} on the left and {} on the right".format( + title, left, right + ), + sortid="page/2904", + ) lc = title.lower() if name_to_code(title, "en") != "": if level > 2: - wxr.wtp.debug("subtitle has language name {} at level {}" - .format(title, level), - sortid="page/2911") + wxr.wtp.debug( + "subtitle has language name {} at level {}".format( + title, level + ), + sortid="page/2911", + ) level = 2 elif lc.startswith(tuple(wxr.config.OTHER_SUBTITLES["etymology"])): if level > 3: - wxr.wtp.debug("etymology section {} at level {}" - .format(title, level), - sortid="page/2917") + wxr.wtp.debug( + "etymology section {} at level {}".format(title, level), + sortid="page/2917", + ) level = 3 elif lc.startswith(tuple(wxr.config.OTHER_SUBTITLES["pronunciation"])): level = 3 @@ -3473,7 +3797,7 @@ def parse_page( text, pre_expand=True, additional_expand=ADDITIONAL_EXPAND_TEMPLATES, - do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES + do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES, ) # from wikitextprocessor.parser import print_tree # print("PAGE PARSE:", print_tree(tree)) @@ -3521,7 +3845,7 @@ def parse_page( if "lang" not in data: wxr.wtp.debug( "internal error -- no lang in data: {}".format(data), - sortid="page/3034" + sortid="page/3034", ) continue for k, v in top_data.items(): @@ -3552,16 +3876,26 @@ def parse_page( if not conjs: continue cpos = dt.get("pos") - if (pos == cpos or - (pos, cpos) in (("noun", "adj"), - ("noun", "name"), - ("name", "noun"), - ("name", "adj"), - ("adj", "noun"), - ("adj", "name")) 
or
-                (pos == "adj" and cpos == "verb" and
-                 any("participle" in s.get("tags", ())
-                     for s in dt.get("senses", ())))):
+            if (
+                pos == cpos
+                or (pos, cpos)
+                in (
+                    ("noun", "adj"),
+                    ("noun", "name"),
+                    ("name", "noun"),
+                    ("name", "adj"),
+                    ("adj", "noun"),
+                    ("adj", "name"),
+                )
+                or (
+                    pos == "adj"
+                    and cpos == "verb"
+                    and any(
+                        "participle" in s.get("tags", ())
+                        for s in dt.get("senses", ())
+                    )
+                )
+            ):
                 data["conjugation"] = list(conjs)  # Copy list!
                 break
     # Add topics from the last sense of a language to its other senses,
@@ -3579,13 +3913,14 @@ def parse_page(
     for x in ret:
         if x["word"] != word:
             if word.startswith("Unsupported titles/"):
-                wxr.wtp.debug(f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'",
-                              sortid="20231101/3578page.py"
-                              )
+                wxr.wtp.debug(
+                    f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'",
+                    sortid="20231101/3578page.py",
+                )
             else:
-                wxr.wtp.debug(f"DIFFERENT ORIGINAL TITLE: '{word}' "
-                              f"-> '{x['word']}'",
-                              sortid="20231101/3582page.py"
-                              )
+                wxr.wtp.debug(
+                    f"DIFFERENT ORIGINAL TITLE: '{word}' " f"-> '{x['word']}'",
+                    sortid="20231101/3582page.py",
+                )
             x["original_title"] = word
     return ret

From 82ab4145408448cb8eb7faacbc5428dd0eb032b7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?=
Date: Wed, 3 Jan 2024 10:49:36 +0200
Subject: [PATCH 2/8] Fixed minor bug with Thesaurus aliases

This code used ns_title_prefix_tuple to get a list of aliases for
'Thesaurus', but then used a constant w[10:] to remove "Thesaurus" only
from the start of `w`. This might never have actually triggered if
there are no aliases for "Thesaurus", though!

The more correct way to fix this is to iterate over the return value of
ns_title_prefix_tuple; a less correct (but probably also fine) fix
would be to just revert back to checking w.startswith("Thesaurus")...
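
As a rough sketch, the intended alias-aware stripping looks something
like this (the prefix values below are made up for illustration; the
real ones come from ns_title_prefix_tuple(wxr, "Thesaurus")):

    # Hypothetical prefix tuple; in the extractor it comes from
    # ns_title_prefix_tuple() and may differ per wiki/namespace setup.
    prefixes = ("Thesaurus:", "WS:")

    def strip_thesaurus_prefix(w: str) -> str:
        for prefix in prefixes:
            if w.startswith(prefix):
                # Strip exactly the matched prefix, not a fixed 10 chars.
                return w[len(prefix):]
        return w

    assert strip_thesaurus_prefix("Thesaurus:happiness") == "happiness"
    assert strip_thesaurus_prefix("WS:happiness") == "happiness"
    assert strip_thesaurus_prefix("happiness") == "happiness"
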
---
 src/wiktextract/extractor/en/page.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py
index 9aa6facd4..44341ce74 100644
--- a/src/wiktextract/extractor/en/page.py
+++ b/src/wiktextract/extractor/en/page.py
@@ -679,8 +679,10 @@ def parse_sense_linkage(
     for i in range(2, 20):
         w = ht.get(i) or ""
         w = clean_node(wxr, data, w)
-        if w.startswith(ns_title_prefix_tuple(wxr, "Thesaurus")):
-            w = w[10:]
+        for alias in ns_title_prefix_tuple(wxr, "Thesaurus"):
+            if w.startswith(alias):
+                w = w[len(alias):]
+                break
         if not w:
             break
         tags: list[str] = []

From 1732011af62090e2df22fc556bec406aeae1c301 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?=
Date: Wed, 3 Jan 2024 13:29:03 +0200
Subject: [PATCH 3/8] Minor type-check stuff

---
 src/wiktextract/extractor/en/page.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py
index 44341ce74..0322bbbdb 100644
--- a/src/wiktextract/extractor/en/page.py
+++ b/src/wiktextract/extractor/en/page.py
@@ -687,7 +687,7 @@ def parse_sense_linkage(
             break
         tags: list[str] = []
         topics: list[str] = []
-        english = None
+        english: Optional[str] = None
         # Try to find qualifiers for this synonym
         q = ht.get("q{}".format(i - 1))
         if q:
@@ -764,14 +764,14 @@ def parse_language(
     base_data = {"word": word, "lang": language, "lang_code": lang_code}
     if is_reconstruction:
         data_append(base_data, "tags", "reconstruction")
-    sense_data = {}
-    pos_data = {}  # For a current part-of-speech
-    etym_data = {}  # For one etymology
-    pos_datas = []
-    etym_datas = []
-    page_datas = []
+    sense_data: WordData = {}
+    pos_data: WordData = {}  # For a current part-of-speech
+    etym_data: WordData = {}  # For one etymology
+    pos_datas: list[WordData] = []
+    etym_datas: list[WordData] = []
+    page_datas: list[WordData] = []
     have_etym = False
-    stack = []
+    stack: list[str] = []  # names of items on the "stack"

     def merge_base(data, base):
         for k, v in base.items():

From cc684eeb30ba006295134f8506a3e449f6cd90f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?=
Date: Wed, 3 Jan 2024 13:29:32 +0200
Subject: [PATCH 4/8] Fix a stupid bug regarding "sounds" and pos

Is there anything worse than seeing a piece of code, thinking to
yourself "that's not right, that can't work?", searching for a keyword
in there to figure out what the original commit was that introduced
this code, and then seeing your own name on that commit?

Yeah, this piece of code has a small error that basically invalidated
the whole purpose of the changes made. Happily, the error was to let
everything through, which is still *ok*, but the desired result was to
filter certain things out.

The error was: I created a function to remove "pos" fields from
"sounds" data when adding that data to a list. The first part of the
block was about filtering out "sounds" data that didn't match the word
or the forms of the word being processed, and the *second* part of the
block was about filtering out wrong "pos" data and then removing the
"pos" sections. But I used the pos-removing function in the first part
of the block, which meant there were no "pos" sections left to compare
against in the second part. Because there was no pos data, all the
sounds were let through.
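
To make the intended order of operations concrete, here is a minimal
sketch with made-up example data (the real code works on the
extractor's word and sound dicts and uses complementary_pop() for the
key removal):

    data = {
        "word": "run",
        "pos": "verb",
        "forms": [{"form": "runs"}],
        "sounds": [
            {"ipa": "/ɹʌn/", "pos": "verb"},                   # kept
            {"ipa": "/ɹʌnz/", "form": "runs", "pos": "noun"},  # dropped in step 2
            {"ipa": "/ɹæn/", "form": "ran"},                   # dropped in step 1
        ],
    }
    # Step 1: keep only sounds whose "form" matches the word or one of
    # its forms; do NOT touch "pos" here.
    accepted = [data["word"]] + [f["form"] for f in data.get("forms", [])]
    data["sounds"] = [
        s for s in data["sounds"] if "form" not in s or s["form"] in accepted
    ]
    # Step 2: keep only sounds whose "pos" matches the entry's pos, and
    # only now drop the "pos" key from the surviving sounds.
    data["sounds"] = [
        {k: v for k, v in s.items() if k != "pos"}
        for s in data["sounds"]
        if "pos" not in s or s["pos"] == data["pos"]
    ]
    assert data["sounds"] == [{"ipa": "/ɹʌn/"}]
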
--- src/wiktextract/extractor/en/page.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py index 0322bbbdb..2a0a0408a 100644 --- a/src/wiktextract/extractor/en/page.py +++ b/src/wiktextract/extractor/en/page.py @@ -773,7 +773,7 @@ def parse_language( have_etym = False stack: list[str] = [] # names of items on the "stack" - def merge_base(data, base): + def merge_base(data: WordData, base: WordData) -> None: for k, v in base.items(): # Copy the value to ensure that we don't share lists or # dicts between structures (even nested ones). @@ -785,9 +785,9 @@ def merge_base(data, base): if data[k] == v: continue if isinstance(data[k], (list, tuple)) or isinstance( - v, (list, tuple) + v, (list, tuple) # Should this be "and"? ): - data[k] = list(data[k]) + list(v) + data[k] = list(data[k]) + list(v) # type: ignore elif data[k] != v: wxr.wtp.warning( "conflicting values for {} in merge_base: " @@ -795,7 +795,8 @@ def merge_base(data, base): sortid="page/904", ) - def complementary_pop(pron, key): + def complementary_pop(pron: WordData, key: str + ) -> WordData: """Remove unnecessary keys from dict values in a list comprehension...""" if key in pron: @@ -806,19 +807,19 @@ def complementary_pop(pron, key): # does not match "word" or one of "forms" if "sounds" in data and "word" in data: accepted = [data["word"]] - accepted.extend(f["form"] for f in data.get("forms", ())) + accepted.extend(f["form"] for f in data.get("forms", dict())) # type:ignore data["sounds"] = list( - complementary_pop(s, "pos") - for s in data["sounds"] - if "form" not in s or s["form"] in accepted + s + for s in data["sounds"] # type:ignore + if "form" not in s or s["form"] in accepted # type:ignore ) # If the result has sounds, eliminate sounds that have a pos that # does not match "pos" if "sounds" in data and "pos" in data: data["sounds"] = list( - s - for s in data["sounds"] - if "pos" not in s or s["pos"] == data["pos"] + complementary_pop(s, "pos") # type:ignore + for s in data["sounds"] # type: ignore + if "pos" not in s or s["pos"] == data["pos"] # type:ignore ) def push_sense(): From 0461d64796b8ae8699038355dfa3fd200ec2ef1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Fri, 19 Jan 2024 10:27:56 +0200 Subject: [PATCH 5/8] Map out data types for word data Currently these are TypedDicts with `total=False`, but it is possible in Python 3.9 to have TypedDicts with mixed required and optional fields by using inheritance; one parent TypedDict has the required fields (`total=True`), the other doesn't, and the child TypedDict class has mixed requirements. This can be done later. 
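
For later reference, one way to write that pattern (the split between
required and optional keys below is purely illustrative, not a
decision):

    from typing import TypedDict

    class _WordDataRequired(TypedDict):
        # total=True by default: these keys must always be present
        word: str
        lang: str
        lang_code: str

    class WordData(_WordDataRequired, total=False):
        # keys inherited from the parent stay required; keys declared
        # here are optional
        pos: str
        etymology_text: str

    w: WordData = {"word": "test", "lang": "English", "lang_code": "en"}
    w["pos"] = "noun"  # optional key added afterwards; type checkers accept this
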
--- src/wiktextract/type_utils.py | 175 ++++++++++++++++++++++++++++++++-- 1 file changed, 166 insertions(+), 9 deletions(-) diff --git a/src/wiktextract/type_utils.py b/src/wiktextract/type_utils.py index 389b541e9..ea2bfa3de 100644 --- a/src/wiktextract/type_utils.py +++ b/src/wiktextract/type_utils.py @@ -1,14 +1,171 @@ from typing import ( + Sequence, + TypedDict, Union, ) -WordData = dict[str, Union[ - str, - int, - list[str], - list[list[str]], - "WordData", - list["WordData"] - ] - ] +class AltOf(TypedDict, total=False): + word: str + extra: str + + +class LinkageData(TypedDict, total=False): + alt: str + english: str + extra: str + qualifier: str + roman: str + ruby: list[Sequence[str]] + sense: str + source: str + tags: list[str] + taxonomic: str + topics: list[str] + urls: list[str] + word: str + + +class ExampleData(TypedDict, total=False): + english: str + note: str + ref: str + roman: str + ruby: list[Sequence[str]] + text: str + type: str + + +class FormOf(TypedDict, total=False): + word: str + extra: str + roman: str + + +LinkData = list[Sequence[str]] + + +class TemplateData(TypedDict, total=False): + args: dict[str, str] + expansion: str + name: str + + +class DescendantData(TypedDict, total=False): + depth: int + tags: list[str] + templates: TemplateData + text: str + + +class FormData(TypedDict, total=False): + form: str + head_nr: int + ipa: str + roman: str + ruby: list[Sequence[str]] + source: str + tags: list[str] + topics: list[str] + + +SoundData = TypedDict( + "SoundData", + { + "audio": str, + "audio-ipa": str, + "enpr": str, + "form": str, + "homophone": str, + "ipa": str, + "mp3_url": str, + "note": str, + "ogg_url": str, + "other": str, + "rhymes": str, + "tags": list[str], + "text": str, + "topics": list[str], + "zh-pron": str, + }, + total=False, +) + + +class TranslationData(TypedDict, total=False): + alt: str + code: str + english: str + lang: str + note: str + roman: str + sense: str + tags: list[str] + taxonomic: str + topics: list[str] + word: str + + +class SenseData(TypedDict, total=False): + alt_of: list[AltOf] + antonyms: list[LinkageData] + categories: list[str] + compound_of: list[AltOf] + coordinate_terms: list[LinkageData] + examples: list[ExampleData] + form_of: list[FormOf] + glosses: list[str] + head_nr: int + holonyms: list[LinkageData] + hypernyms: list[LinkageData] + hyponyms: list[LinkageData] + instances: list[LinkageData] + links: list[LinkData] + meronyms: list[LinkageData] + qualifier: str + raw_glosses: list[str] + related: list[LinkageData] + senseid: list[str] + synonyms: list[LinkageData] + tags: list[str] + topics: list[str] + wikidata: list[str] + wikipedia: list[str] + + +class WordData(TypedDict, total=False): + abbreviations: list[LinkageData] + alt_of: list[AltOf] + antonyms: list[LinkageData] + categories: list[str] + coordinate_terms: list[LinkageData] + derived: list[LinkageData] + descendants: list[DescendantData] + etymology_number: int + etymology_templates: list[TemplateData] + etymology_text: str + form_of: list[FormOf] + forms: list[FormData] + head_templates: list[TemplateData] + holonyms: list[LinkageData] + hyphenation: list[str] + hypernyms: list[LinkageData] + hyponyms: list[LinkageData] + inflection_templates: list[TemplateData] + instances: list[LinkageData] + lang: str + lang_code: str + meronyms: list[LinkageData] + original_title: str + pos: str + proverbs: list[LinkageData] + redirects: list[str] + related: list[LinkageData] + senses: list[SenseData] + sounds: list[SoundData] + synonyms: list[LinkageData] 
+ translations: list[TranslationData] + troponyms: list[LinkageData] + wikidata: list[str] + wikipedia: list[str] + word: str From 12427c6800ded4a1184110f188a1c3cd2d64894d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Thu, 25 Jan 2024 10:48:02 +0200 Subject: [PATCH 6/8] More typing stuff Sorry if it's messy, had to do some temp commits and I don't want to touch this anymore after amending this, just in case. --- src/wiktextract/clean.py | 238 ++++++++------ src/wiktextract/extractor/en/page.py | 205 +++++++----- src/wiktextract/extractor/ruby.py | 21 +- src/wiktextract/linkages.py | 463 +++++++++++++++++---------- src/wiktextract/page.py | 23 +- src/wiktextract/type_utils.py | 1 - 6 files changed, 585 insertions(+), 366 deletions(-) diff --git a/src/wiktextract/clean.py b/src/wiktextract/clean.py index 7cb7e46da..1f078fefd 100644 --- a/src/wiktextract/clean.py +++ b/src/wiktextract/clean.py @@ -9,13 +9,12 @@ import re import html import unicodedata -from typing import ( - Callable, - Optional, - Union -) +from typing import Callable, Optional, Union from wikitextprocessor.common import MAGIC_FIRST, MAGIC_LAST -from wikitextprocessor.core import NamespaceDataEntry +from wikitextprocessor.core import ( + NamespaceDataEntry, + TemplateArgs, +) from .wxr_context import WiktextractContext ###################################################################### @@ -94,7 +93,7 @@ "ι": "ᶥ", "φ": "ᵠ", "χ": "ᵡ", - "∞": "\u2002᪲" # This is a KLUDGE + "∞": "\u2002᪲", # This is a KLUDGE } subscript_ht: dict[str, str] = { @@ -137,6 +136,7 @@ "χ": "ᵪ", } + def to_superscript(text: str) -> str: "Converts text to superscript." if not text: @@ -147,6 +147,7 @@ def to_superscript(text: str) -> str: return "^" + text return "^({})".format(text) + def to_subscript(text: str) -> str: """Converts text to subscript.""" if not text: @@ -157,10 +158,11 @@ def to_subscript(text: str) -> str: return "_" + text return "_({})".format(text) + def to_chem(text: str) -> str: """Converts text to chemical formula, making digits subscript.""" - return "".join(to_subscript(x) if x.isdigit() else x - for x in text) + return "".join(to_subscript(x) if x.isdigit() else x for x in text) + # Mapping from Latex names to Unicode characters/strings. This is the # default mapping (some cases are handled specially in the code). 
@@ -886,7 +888,6 @@ def to_chem(text: str) -> str: "zpipe": "⨠", "zproject": "⨡", "|": "‖", - # Accents XXX these really should be handled specially with diacritics # after argument "acute": "́", @@ -906,8 +907,6 @@ def to_chem(text: str) -> str: "overline": "◌̅", "tilde": "̃", "vec": "⃑", - - # Some ignored operators "bigl": "", "bigr": "", @@ -973,7 +972,7 @@ def to_chem(text: str) -> str: "z": "𝓏", } -mathfrak_map: dict[str, str]= { +mathfrak_map: dict[str, str] = { "A": "𝔄", "B": "𝔅", "C": "ℭ", @@ -1070,15 +1069,19 @@ def to_chem(text: str) -> str: "9": "𝟡", } + def mathcal_fn(text: str) -> str: return "".join(mathcal_map.get(x, x) for x in text) + def mathfrak_fn(text: str) -> str: return "".join(mathfrak_map.get(x, x) for x in text) + def mathbb_fn(text: str) -> str: return "".join(mathbb_map.get(x, x) for x in text) + def to_math(text: str) -> str: """Converts a mathematical formula to ASCII.""" # print("to_math: {!r}".format(text)) @@ -1088,22 +1091,25 @@ def expand(text: str) -> str: while True: orig = text # formatting with {:c} converts input into character - text = re.sub(r"[{:c}-{:c}]".format(MAGIC_FIRST, MAGIC_LAST), - lambda m: magic_vec[ord(m.group(0)) - MAGIC_FIRST], - text) + text = re.sub( + r"[{:c}-{:c}]".format(MAGIC_FIRST, MAGIC_LAST), + lambda m: magic_vec[ord(m.group(0)) - MAGIC_FIRST], + text, + ) if text == orig: break return text def recurse(text: str) -> str: - def math_magic(text: str, - left: str, - right: str, - fn: Callable[[str], str] + def math_magic( + text: str, left: str, right: str, fn: Callable[[str], str] ) -> str: regexp_str = r"{}([^{}{}]+){}".format( - re.escape(left), re.escape(left), - re.escape(right), re.escape(right)) + re.escape(left), + re.escape(left), + re.escape(right), + re.escape(right), + ) regexp = re.compile(regexp_str) def repl(m: re.Match) -> str: @@ -1150,8 +1156,11 @@ def expand_group(v: str) -> str: elif re.match(r"\\sqrt($|[0-9]|\b)", v): v = "√" elif re.match(r"\\(frac|binom)($|[0-9]|\b)", v): - m = re.match(r"\\(frac|binom)\s*(\\[a-zA-Z]+|\\.|.)\s*" - r"(\\[a-zA-Z]+|\\.|.)$", v) + m = re.match( + r"\\(frac|binom)\s*(\\[a-zA-Z]+|\\.|.)\s*" + r"(\\[a-zA-Z]+|\\.|.)$", + v, + ) if not m: print("MATH FRAC/BINOM ERROR: {!r}".format(v)) return v @@ -1198,31 +1207,37 @@ def expand_group(v: str) -> str: text = math_magic(text, "{", "}", recurse) if text == orig: break - for m in re.finditer(r"\s+|" - r"\\frac\s*(\\[a-zA-Z]+|\\.|.)\s*" - r"(\\dot\\(bigvee|cup|cap|lor|vee)|" - r"\\not\\(subset|supset|subseteq|supseteq|in|ni|" - r"preceq|succeq|vartrianglelefteq|" - r"vartrianglerighteq|trianglelefteq|" - r"trianglerighteq)|" - r"\\widehat\{=\}|\\widehat=|" - r"\\overset\{?\}\{=\}|" - r"\\overset\?=|" - r"\\overset\{\\operatorname\{def\}\}\{=\}|" - r"\\[a-zA-Z]+|\\.|.)|" - r"(\\(mathcal|mathfrak|mathbb|text|begin|end|pmod)" - r"\b\s*|" - r"\\sqrt\b(\[\d+\])?)?" - r"[_^]?(\\[a-zA-Z]+\s*|\\.|\w+|.)", text): + for m in re.finditer( + r"\s+|" + r"\\frac\s*(\\[a-zA-Z]+|\\.|.)\s*" + r"(\\dot\\(bigvee|cup|cap|lor|vee)|" + r"\\not\\(subset|supset|subseteq|supseteq|in|ni|" + r"preceq|succeq|vartrianglelefteq|" + r"vartrianglerighteq|trianglelefteq|" + r"trianglerighteq)|" + r"\\widehat\{=\}|\\widehat=|" + r"\\overset\{?\}\{=\}|" + r"\\overset\?=|" + r"\\overset\{\\operatorname\{def\}\}\{=\}|" + r"\\[a-zA-Z]+|\\.|.)|" + r"(\\(mathcal|mathfrak|mathbb|text|begin|end|pmod)" + r"\b\s*|" + r"\\sqrt\b(\[\d+\])?)?" 
+ r"[_^]?(\\[a-zA-Z]+\s*|\\.|\w+|.)", + text, + ): v = m.group(0).strip() if not v: continue v = expand_group(v) if v: - if ((parts and parts[-1][-1].isalpha() and - v[0] in "0123456789") or - (parts and parts[-1][-1] in "0123456789" and - v[0] in "0123456789")): + if ( + parts and parts[-1][-1].isalpha() and v[0] in "0123456789" + ) or ( + parts + and parts[-1][-1] in "0123456789" + and v[0] in "0123456789" + ): v = " " + v parts.append(v) @@ -1237,7 +1252,7 @@ def expand_group(v: str) -> str: def bold_follows(parts: list[str], i: int) -> bool: """Checks if there is a bold (''') in parts after parts[i]. We allow intervening italics ('').""" - parts = parts[i + 1:] + parts = parts[i + 1 :] for p in parts: if not p.startswith("''"): continue @@ -1308,13 +1323,12 @@ def remove_italic_and_bold(text: str) -> str: continue new_text_parts.append(part) new_text_parts.append("\n") - new_text_parts = new_text_parts[:-1] # remove last \n + new_text_parts = new_text_parts[:-1] # remove last \n return "".join(new_text_parts) -def clean_value(wxr: WiktextractContext, - title: str, - no_strip=False, - no_html_strip=False + +def clean_value( + wxr: WiktextractContext, title: str, no_strip=False, no_html_strip=False ) -> str: """Cleans a title or value into a normal string. This should basically remove any Wikimedia formatting from it: HTML tags, templates, links, @@ -1334,17 +1348,18 @@ def repl_exturl(m: re.Match) -> str: break i += 1 return " ".join(args[i:]) + def repl_link(m: re.Match) -> str: if m.group(2) and m.group(2).lower() in ("file", "image"): return "" v = m.group(3).split("|") return clean_value(wxr, v[0], no_strip=True) + def repl_link_bars(m: re.Match) -> str: lnk = m.group(1) if re.match(r"(?si)(File|Image)\s*:", lnk): return "" - return clean_value(wxr, m.group(4) or m.group(2) or "", - no_strip=True) + return clean_value(wxr, m.group(4) or m.group(2) or "", no_strip=True) def repl_1_sup(m: re.Match) -> str: return to_superscript(clean_value(wxr, m.group(1))) @@ -1373,34 +1388,47 @@ def repl_1_syntaxhighlight(m: re.Match) -> str: # Remove references (...). title = re.sub(r"(?is)/]*?>\s*.*?", "", title) # Replace ... by stripped content without newlines - title = re.sub(r"(?is)]*?>(.*?)\s*", - lambda m: re.sub(r"\s+", " ", m.group(1)), - title) + title = re.sub( + r"(?is)]*?>(.*?)\s*", + lambda m: re.sub(r"\s+", " ", m.group(1)), + title, + ) # Replace
by comma space (it is used to express alternatives in some # declensions) title = re.sub(r"(?si)\s*\n*", "\n", title) # Remove divs with floatright class (generated e.g. by {{ja-kanji|...}}) - title = re.sub(r'(?si)]*?\bclass="[^"]*?\bfloatright\b[^>]*?>' - r'((|.)*?)|.)*?' - r'', - "", title) + title = re.sub( + r'(?si)]*?\bclass="[^"]*?\bfloatright\b[^>]*?>' + r"((|.)*?)|.)*?" + r"", + "", + title, + ) # Remove divs with float: attribute - title = re.sub(r'(?si)]*?\bstyle="[^"]*?\bfloat:[^>]*?>' - r'((|.)*?)|.)*?' - r'', - "", title) + title = re.sub( + r'(?si)]*?\bstyle="[^"]*?\bfloat:[^>]*?>' + r"((|.)*?)|.)*?" + r"", + "", + title, + ) # Remove with previewonly class (generated e.g. by {{taxlink|...}}) - title = re.sub(r'(?si)]*?\bclass="[^"<>]*?' - r'\bpreviewonly\b[^>]*?>' - r'.+?', - "", title) + title = re.sub( + r'(?si)]*?\bclass="[^"<>]*?' + r"\bpreviewonly\b[^>]*?>" + r".+?", + "", + title, + ) # Remove ... - title = re.sub(r'(?si)]*?\bclass="[^"]*?\berror\b[^>]*?>' - r'.+?', - "", title) + title = re.sub( + r'(?si)]*?\bclass="[^"]*?\berror\b[^>]*?>' + r".+?", + "", + title, + ) # Change
and
to newlines. Ditto for tr, li, table, dl, ul, ol - title = re.sub(r"(?si)]*>", - "\n", title) + title = re.sub(r"(?si)]*>", "\n", title) # Change
,
, and
into newlines; # these generate new rows/lines. title = re.sub(r"(?i)", "\n", title) @@ -1408,22 +1436,20 @@ def repl_1_syntaxhighlight(m: re.Match) -> str: title = re.sub(r"(?si)]*>", " ", title) # Change ... to ^ title = re.sub(r"(?si)]*>\s*", "", title) - title = re.sub(r"(?si)]*>(.*?)", - repl_1_sup, title) + title = re.sub(r"(?si)]*>(.*?)", repl_1_sup, title) # Change ... to _ title = re.sub(r"(?si)]*>\s*", "", title) - title = re.sub(r"(?si)]*>(.*?)", - repl_1_sub, title) + title = re.sub(r"(?si)]*>(.*?)", repl_1_sub, title) # Change ... using subscripts for digits - title = re.sub(r"(?si)]*>(.*?)", - repl_1_chem, title) + title = re.sub(r"(?si)]*>(.*?)", repl_1_chem, title) # Change ... using special formatting. - title = re.sub(r"(?si)]*>(.*?)", - repl_1_math, title) + title = re.sub(r"(?si)]*>(.*?)", repl_1_math, title) # Change ... using special formatting. - title = re.sub(r"(?si)]*>(.*?)" - r"", - repl_1_syntaxhighlight, title) + title = re.sub( + r"(?si)]*>(.*?)" r"", + repl_1_syntaxhighlight, + title, + ) # Remove any remaining HTML tags. if not no_html_strip: title = re.sub(r"(?s)<[/!a-zA-Z][^>]*>", "", title) @@ -1441,7 +1467,7 @@ def repl_1_syntaxhighlight(m: re.Match) -> str: category_ns_data: NamespaceDataEntry # XXX "Category" -> config variable for portability - category_ns_data = wxr.wtp.NAMESPACE_DATA.get("Category", {}) # type: ignore[typeddict-item] + category_ns_data = wxr.wtp.NAMESPACE_DATA.get("Category", {}) # type: ignore[typeddict-item] # Fail if we received empty dict from .get() category_ns_names = {category_ns_data["name"]} | set( category_ns_data["aliases"] @@ -1455,22 +1481,30 @@ def repl_1_syntaxhighlight(m: re.Match) -> str: "", title, ) - title = re.sub(r"(?s)\[\[\s*:?([^]|#<>]+?)\s*(#[^][|<>]*?)?\]\]", - repl_1, title) - title = re.sub(r"(?s)\[\[\s*(([a-zA-Z0-9]+)\s*:)?\s*([^][#|<>]+?)" - r"\s*(#[^][|]*?)?\|?\]\]", - repl_link, title) - title = re.sub(r"(?s)\[\[\s*([^][|<>]+?)\s*\|" - r"\s*(([^][|]|\[[^]]*\])+?)" - r"(\s*\|\s*(([^]|]|\[[^]]*\])+?))*\s*\]\]", - repl_link_bars, title) + title = re.sub( + r"(?s)\[\[\s*:?([^]|#<>]+?)\s*(#[^][|<>]*?)?\]\]", repl_1, title + ) + title = re.sub( + r"(?s)\[\[\s*(([a-zA-Z0-9]+)\s*:)?\s*([^][#|<>]+?)" + r"\s*(#[^][|]*?)?\|?\]\]", + repl_link, + title, + ) + title = re.sub( + r"(?s)\[\[\s*([^][|<>]+?)\s*\|" + r"\s*(([^][|]|\[[^]]*\])+?)" + r"(\s*\|\s*(([^]|]|\[[^]]*\])+?))*\s*\]\]", + repl_link_bars, + title, + ) if title == orig: break # Replace remaining HTML links by the URL. 
while True: orig = title - title = re.sub(r"\[\s*((https?:|mailto:)?//([^][]+?))\s*\]", - repl_exturl, title) + title = re.sub( + r"\[\s*((https?:|mailto:)?//([^][]+?))\s*\]", repl_exturl, title + ) if title == orig: break @@ -1508,14 +1542,16 @@ def repl_1_syntaxhighlight(m: re.Match) -> str: return title -def clean_template_args(wxr: WiktextractContext, - ht: dict[Union[int, str], str], # XXX -> "TemplateArgs" - no_strip=False +def clean_template_args( + wxr: WiktextractContext, ht: TemplateArgs, no_strip=False ) -> dict[str, str]: """Cleans all values in a template argument dictionary and returns the cleaned dictionary.""" assert isinstance(wxr, WiktextractContext) assert isinstance(ht, dict) - return {clean_value(wxr, str(k), no_html_strip=True): - clean_value(wxr, str(v), no_strip=no_strip, no_html_strip=True) - for k, v in ht.items()} + return { + clean_value(wxr, str(k), no_html_strip=True): clean_value( + wxr, str(v), no_strip=no_strip, no_html_strip=True + ) + for k, v in ht.items() + } diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py index 2a0a0408a..f7286c46d 100644 --- a/src/wiktextract/extractor/en/page.py +++ b/src/wiktextract/extractor/en/page.py @@ -11,14 +11,22 @@ from functools import partial from re import Pattern from typing import ( + TYPE_CHECKING, + Callable, Optional, Set, Union, + cast, ) from mediawiki_langcodes import get_all_names, name_to_code from wikitextprocessor import NodeKind, WikiNode -from wikitextprocessor.core import TemplateArgs +from wikitextprocessor.core import ( + TemplateArgs, + TemplateFnCallable, + PostTemplateFnCallable, +) +from wikitextprocessor.parser import GeneralNode from wiktextract.clean import clean_template_args from wiktextract.datautils import ( data_append, @@ -44,7 +52,11 @@ from wiktextract.parts_of_speech import PARTS_OF_SPEECH from wiktextract.tags import valid_tags from wiktextract.translations import parse_translation_item_text -from wiktextract.type_utils import WordData +from wiktextract.type_utils import ( + SenseData, + SoundData, + WordData, +) from wiktextract.wxr_context import WiktextractContext from ..ruby import extract_ruby, parse_ruby @@ -666,7 +678,7 @@ def decode_html_entities(v: Union[str, int]) -> str: def parse_sense_linkage( wxr: WiktextractContext, - data: WordData, + data: SenseData, name: str, ht: TemplateArgs, ) -> None: @@ -681,7 +693,7 @@ def parse_sense_linkage( w = clean_node(wxr, data, w) for alias in ns_title_prefix_tuple(wxr, "Thesaurus"): if w.startswith(alias): - w = w[len(alias):] + w = w[len(alias) :] break if not w: break @@ -761,13 +773,17 @@ def parse_language( word = word[word.find("/") + 1 :] is_reconstruction = True - base_data = {"word": word, "lang": language, "lang_code": lang_code} + base_data: WordData = { + "word": word, + "lang": language, + "lang_code": lang_code, + } if is_reconstruction: data_append(base_data, "tags", "reconstruction") - sense_data: WordData = {} + sense_data: SenseData = {} pos_data: WordData = {} # For a current part-of-speech etym_data: WordData = {} # For one etymology - pos_datas: list[WordData] = [] + pos_datas: list[SenseData] = [] etym_datas: list[WordData] = [] page_datas: list[WordData] = [] have_etym = False @@ -780,49 +796,54 @@ def merge_base(data: WordData, base: WordData) -> None: v = copy.deepcopy(v) if k not in data: # The list was copied above, so this will not create shared ref - data[k] = v + data[k] = v # type: ignore[literal-required] continue - if data[k] == v: + if data[k] == v: # type: 
ignore[literal-required] continue - if isinstance(data[k], (list, tuple)) or isinstance( - v, (list, tuple) # Should this be "and"? + if ( + isinstance(data[k], (list, tuple)) # type: ignore[literal-required] + or isinstance( + v, + (list, tuple), # Should this be "and"? + ) ): data[k] = list(data[k]) + list(v) # type: ignore - elif data[k] != v: + elif data[k] != v: # type: ignore[literal-required] wxr.wtp.warning( "conflicting values for {} in merge_base: " - "{!r} vs {!r}".format(k, data[k], v), + "{!r} vs {!r}".format(k, data[k], v), # type: ignore[literal-required] sortid="page/904", ) - def complementary_pop(pron: WordData, key: str - ) -> WordData: + def complementary_pop(pron: SoundData, key: str) -> SoundData: """Remove unnecessary keys from dict values in a list comprehension...""" if key in pron: - pron.pop(key) + pron.pop(key) # type: ignore return pron # If the result has sounds, eliminate sounds that have a prefix that # does not match "word" or one of "forms" if "sounds" in data and "word" in data: accepted = [data["word"]] - accepted.extend(f["form"] for f in data.get("forms", dict())) # type:ignore + accepted.extend(f["form"] for f in data.get("forms", dict())) data["sounds"] = list( s - for s in data["sounds"] # type:ignore - if "form" not in s or s["form"] in accepted # type:ignore + for s in data["sounds"] + if "form" not in s or s["form"] in accepted ) # If the result has sounds, eliminate sounds that have a pos that # does not match "pos" if "sounds" in data and "pos" in data: data["sounds"] = list( - complementary_pop(s, "pos") # type:ignore - for s in data["sounds"] # type: ignore - if "pos" not in s or s["pos"] == data["pos"] # type:ignore + complementary_pop(s, "pos") + for s in data["sounds"] + # "pos" is not a field of SoundData, correctly, so we're + # removing it here. It's a kludge on a kludge on a kludge. + if "pos" not in s or s["pos"] == data["pos"] # type: ignore[typeddict-item] ) - def push_sense(): + def push_sense() -> bool: """Starts collecting data for a new word sense. This returns True if a sense was added.""" nonlocal sense_data @@ -865,20 +886,20 @@ def push_sense(): sense_data = {} return True - def push_pos(): + def push_pos() -> None: """Starts collecting data for a new part-of-speech.""" nonlocal pos_data nonlocal pos_datas push_sense() if wxr.wtp.subsection: - data = {"senses": pos_datas} + data: WordData = {"senses": pos_datas} merge_base(data, pos_data) etym_datas.append(data) pos_data = {} pos_datas = [] wxr.wtp.start_subsection(None) - def push_etym(): + def push_etym() -> None: """Starts collecting data for a new etymology.""" nonlocal etym_data nonlocal etym_datas @@ -891,7 +912,7 @@ def push_etym(): etym_data = {} etym_datas = [] - def select_data(): + def select_data() -> WordData: """Selects where to store data (pos or etym) based on whether we are inside a pos (part-of-speech).""" if wxr.wtp.subsection is not None: @@ -900,7 +921,9 @@ def select_data(): return base_data return etym_data - def head_post_template_fn(name, ht, expansion): + def head_post_template_fn( + name: str, ht: TemplateArgs, expansion: str + ) -> str: """Handles special templates in the head section of a word. Head section is the text after part-of-speech subtitle and before word sense list. 
Typically it generates the bold line for the word, but @@ -932,7 +955,7 @@ def head_post_template_fn(name, ht, expansion): # Note: various places expect to have content from wikipedia # templates, so cannot convert this to empty parse_wikipedia_template(wxr, pos_data, ht) - return None + return "" if name == "number box": # XXX extract numeric value? @@ -960,17 +983,17 @@ def head_post_template_fn(name, ht, expansion): # XXX extract? return "" - return None + return "" - def parse_part_of_speech(posnode, pos): + def parse_part_of_speech(posnode: WikiNode, pos: str) -> None: """Parses the subsection for a part-of-speech under a language on a page.""" assert isinstance(posnode, WikiNode) assert isinstance(pos, str) # print("parse_part_of_speech", pos) pos_data["pos"] = pos - pre = [[]] # list of lists - lists = [[]] # list of lists + pre: list[list[Union[str, WikiNode]]] = [[]] # list of lists + lists: list[list[WikiNode]] = [[]] # list of lists first_para = True first_head_tmplt = True collecting_head = True @@ -999,7 +1022,7 @@ def parse_part_of_speech(posnode, pos): ), ) tempnode = WikiNode(NodeKind.LEVEL5, 0) - tempnode.largs = ["Inflection"] + tempnode.largs = [["Inflection"]] tempnode.children = floaters parse_inflection(tempnode, "Floating Div", pos) # print(poschildren) @@ -1096,7 +1119,9 @@ def parse_part_of_speech(posnode, pos): # skip these templates; panel_templates is already used # to skip certain templates else, but it also applies to # head parsing quite well. - if is_panel_template(wxr, node.largs[0][0]): + # node.largs[0][0] should always be str, but can't type-check + # that. + if is_panel_template(wxr, node.largs[0][0]): # type: ignore[arg-type] continue # skip these templates # if node.largs[0][0] in skip_these_templates_in_head: @@ -1127,8 +1152,8 @@ def parse_part_of_speech(posnode, pos): # Clean up empty pairs, and fix messes with extra newlines that # separate templates that are followed by lists wiktextract issue #314 - cleaned_pre = [] - cleaned_lists = [] + cleaned_pre: list[list[Union[str, WikiNode]]] = [] + cleaned_lists: list[list[WikiNode]] = [] pairless_pre_index = None for pre1, ls in zip(pre, lists): @@ -1154,7 +1179,7 @@ def parse_part_of_speech(posnode, pos): lists = cleaned_lists there_are_many_heads = len(pre) > 1 - header_tags = [] + header_tags: list[str] = [] if not any(g for g in lists): process_gloss_without_list(poschildren, pos, pos_data, header_tags) @@ -1245,10 +1270,10 @@ def parse_part_of_speech(posnode, pos): # the data is already pushed into a sub-gloss # downstream, unless the higher level has examples # that need to be put somewhere. - common_data = {"tags": list(header_tags)} + common_data: SenseData = {"tags": list(header_tags)} if head_group: common_data["head_nr"] = head_group - parse_sense_node(node, common_data, pos) + parse_sense_node(node, common_data, pos) # type: ignore[arg-type] # If there are no senses extracted, add a dummy sense. We want to # keep tags extracted from the head for the dummy sense. 
@@ -1262,7 +1287,7 @@ def process_gloss_header( header_nodes: list[Union[WikiNode, str]], pos_type: str, header_group: Optional[int], - pos_data: dict, + pos_data: WordData, header_tags: list[str], ) -> None: ruby = [] @@ -1278,6 +1303,10 @@ def process_gloss_header( ) if rub is not None: for r in rub: + if TYPE_CHECKING: + # we know the lambda above in recursively_extract + # returns only WikiNodes in rub + assert isinstance(r, WikiNode) rt = parse_ruby(wxr, r) if rt is not None: ruby.append(rt) @@ -1295,24 +1324,28 @@ def process_gloss_header( ruby=ruby, ) if "tags" in pos_data: - header_tags[:] = pos_data["tags"] - del pos_data["tags"] + # pos_data can get "tags" data from some source; type-checkers + # doesn't like it, so let's ignore it. + header_tags[:] = pos_data["tags"] # type: ignore[typeddict-item] + del pos_data["tags"] # type: ignore[typeddict-item] else: header_tags.clear() def process_gloss_without_list( nodes: list[Union[WikiNode, str]], pos_type: str, - pos_data: dict, + pos_data: WordData, header_tags: list[str], ) -> None: # gloss text might not inside a list - header_nodes = [] - gloss_nodes = [] + header_nodes: list[Union[str, WikiNode]] = [] + gloss_nodes: list[Union[str, WikiNode]] = [] for node in strip_nodes(nodes): if isinstance(node, WikiNode): if node.kind == NodeKind.TEMPLATE: template_name = node.largs[0][0] + if TYPE_CHECKING: + assert isinstance(template_name, str) if template_name == "head" or template_name.startswith( f"{lang_code}-" ): @@ -1331,7 +1364,11 @@ def process_gloss_without_list( gloss_nodes, pos_type, {"tags": list(header_tags)} ) - def parse_sense_node(node, sense_base, pos): + def parse_sense_node( + node: Union[str, WikiNode], # never receives str + sense_base: SenseData, + pos: str, + ) -> bool: """Recursively (depth first) parse LIST_ITEM nodes for sense data. Uses push_sense() to attempt adding data to pos_data in the scope of parse_language() when it reaches deep in the recursion. push_sense() @@ -1342,6 +1379,7 @@ def parse_sense_node(node, sense_base, pos): """ assert isinstance(sense_base, dict) # Added to every sense deeper in if not isinstance(node, WikiNode): + # This doesn't seem to ever happen in practice. wxr.wtp.debug( "{}: parse_sense_node called with" "something that isn't a WikiNode".format(pos), @@ -1368,7 +1406,7 @@ def parse_sense_node(node, sense_base, pos): # added |= push_sense() or added |= parse_sense_node(...) to OR. added = False - gloss_template_args = set() + gloss_template_args: set[str] = set() # For LISTs and LIST_ITEMS, their argument is something like # "##" or "##:", and using that we can rudimentally determine @@ -1470,7 +1508,7 @@ def parse_sense_node(node, sense_base, pos): def process_gloss_contents( contents: list[Union[str, WikiNode]], pos: str, - sense_base: dict, + sense_base: SenseData, subentries: list[WikiNode] = [], others: list[WikiNode] = [], gloss_template_args: Set[str] = set(), @@ -1544,7 +1582,7 @@ def sense_template_fn( gloss_template_args.add(v) return None - def extract_link_texts(item): + def extract_link_texts(item: GeneralNode) -> None: """Recursively extracts link texts from the gloss source. This information is used to select whether to remove final "." 
from form_of/alt_of (e.g., ihm/Hunsrik).""" @@ -1578,11 +1616,16 @@ def extract_link_texts(item): # get the raw text of non-list contents of this node, and other stuff # like tag and category data added to sense_base + # cast = no-op type-setter for the type-checker + partial_template_fn = cast( + TemplateFnCallable, + partial(sense_template_fn, is_gloss=True), + ) rawgloss = clean_node( wxr, sense_base, contents, - template_fn=partial(sense_template_fn, is_gloss=True), + template_fn=partial_template_fn, collect_links=True, ) @@ -1633,12 +1676,12 @@ def extract_link_texts(item): parse_sense_qualifier(wxr, q, sense_base) if rawgloss == "A pejorative:": data_append(sense_base, "tags", "pejorative") - rawgloss = None + rawgloss = "" elif rawgloss == "Short forms.": data_append(sense_base, "tags", "abbreviation") - rawgloss = None + rawgloss = "" elif rawgloss == "Technical or specialized senses.": - rawgloss = None + rawgloss = "" if rawgloss: data_append(sense_base, "glosses", rawgloss) if rawgloss in ("A person:",): @@ -1747,7 +1790,7 @@ def extract_link_texts(item): data_extend(sense_data, k, v) else: assert k not in ("tags", "categories", "topics") - sense_data[k] = v + sense_data[k] = v # type:ignore[literal-required] # Parse the gloss for this particular sense m = re.match(r"^\((([^()]|\([^()]*\))*)\):?\s*", gloss) # (...): ... or (...(...)...): ... @@ -1878,7 +1921,7 @@ def extract_link_texts(item): data_append(sense_data, "form_of", dt) if len(sense_data) == 0: - if len(sense_base.get("tags")) == 0: + if len(sense_base.get("tags", [])) == 0: del sense_base["tags"] sense_data.update(sense_base) if push_sense(): @@ -1887,7 +1930,9 @@ def extract_link_texts(item): # print("PARSE_SENSE DONE:", pos_datas[-1]) return added - def parse_inflection(node, section, pos): + def parse_inflection( + node: WikiNode, section: str, pos: Optional[str] + ) -> None: """Parses inflection data (declension, conjugation) from the given page. This retrieves the actual inflection template parameters, which are very useful for applications that need @@ -1904,7 +1949,9 @@ def parse_inflection(node, section, pos): ) return - def inflection_template_fn(name, ht): + def inflection_template_fn( + name: str, ht: TemplateArgs + ) -> Optional[str]: # print("decl_conj_template_fn", name, ht) if is_panel_template(wxr, name): return "" @@ -1945,7 +1992,7 @@ def inflection_template_fn(name, ht): # print(text) # print(repr(brace_matches)) if len(brace_matches) > 1: - tsection = [] + tsection: list[str] = [] after_templates = False # kludge to keep any text # before first template # with the first template; @@ -2024,7 +2071,9 @@ def inflection_template_fn(name, ht): tablecontext=tablecontext, ) - def get_subpage_section(title, subtitle, seq): + def get_subpage_section( + title: str, subtitle: str, seq: Union[list[str], tuple[str, ...]] + ) -> Optional[Union[WikiNode, str]]: """Loads a subpage of the given page, and finds the section for the given language, part-of-speech, and section title. 
This is used for finding translations and other sections on subpages.""" @@ -2042,8 +2091,11 @@ def get_subpage_section(title, subtitle, seq): "{{see translation subpage|...}}", sortid="page/1934", ) + return None - def recurse(node, seq): + def recurse( + node: Union[str, WikiNode], seq: Union[list[str], tuple[str, ...]] + ) -> Optional[Union[str, WikiNode]]: # print(f"seq: {seq}") if not seq: return node @@ -2080,7 +2132,9 @@ def recurse(node, seq): ) return ret - def parse_linkage(data, field, linkagenode): + def parse_linkage( + data: WordData, field: str, linkagenode: WikiNode + ) -> None: assert isinstance(data, dict) assert isinstance(field, str) assert isinstance(linkagenode, WikiNode) @@ -2095,7 +2149,11 @@ def parse_linkage(data, field, linkagenode): toplevel_text = [] next_navframe_sense = None # Used for "(sense):" before NavFrame - def parse_linkage_item(contents, field, sense): + def parse_linkage_item( + contents: list[Union[str, WikiNode]], + field: str, + sense: Optional[str] = None, + ): assert isinstance(contents, (list, tuple)) assert isinstance(field, str) assert sense is None or isinstance(sense, str) @@ -2103,11 +2161,13 @@ def parse_linkage_item(contents, field, sense): # print("PARSE_LINKAGE_ITEM: {} ({}): {}" # .format(field, sense, contents)) - parts = [] - ruby = [] - urls = [] + parts: list[str] = [] + ruby: list[tuple[str, str]] = [] + urls: list[str] = [] - def item_recurse(contents, italic=False): + def item_recurse( + contents: list[Union[str, WikiNode]], italic=False + ) -> None: assert isinstance(contents, (list, tuple)) nonlocal sense nonlocal ruby @@ -2122,6 +2182,7 @@ def item_recurse(contents, italic=False): # node.sarg if node.sarg else node.largs) if kind == NodeKind.LIST: if parts: + sense1: Optional[str] sense1 = clean_node(wxr, None, parts) if sense1.endswith(":"): sense1 = sense1[:-1].strip() @@ -2174,8 +2235,8 @@ def item_recurse(contents, italic=False): elif kind == NodeKind.LINK: ignore = False if isinstance(node.largs[0][0], str): - v = node.largs[0][0].strip().lower() - if v.startswith( + v1 = node.largs[0][0].strip().lower() + if v1.startswith( ns_title_prefix_tuple(wxr, "Category", True) + ns_title_prefix_tuple(wxr, "File", True) ): @@ -2188,16 +2249,16 @@ def item_recurse(contents, italic=False): and isinstance(v[0], str) and v[0][0] == ":" ): - v = [v[0][1:]] + list(v[1:]) + v = [v[0][1:]] + list(v[1:]) # type:ignore item_recurse(v, italic=italic) elif kind == NodeKind.URL: if len(node.largs) < 2 and node.largs: # Naked url captured - urls.extend(node.largs[-1]) + urls.extend(node.largs[-1]) # type:ignore[arg-type] continue if len(node.largs) == 2: # Url from link with text - urls.append(node.largs[0][-1]) + urls.append(node.largs[0][-1]) # type:ignore[arg-type] # print(f"{node.largs=!r}") # print("linkage recurse URL {}".format(node)) item_recurse(node.largs[-1], italic=italic) diff --git a/src/wiktextract/extractor/ruby.py b/src/wiktextract/extractor/ruby.py index 43e2ee38f..1a287758c 100644 --- a/src/wiktextract/extractor/ruby.py +++ b/src/wiktextract/extractor/ruby.py @@ -1,8 +1,12 @@ from typing import List, Optional, Tuple, Union from wikitextprocessor import NodeKind, WikiNode -from wikitextprocessor.parser import HTMLNode, LevelNode, TemplateNode - +from wikitextprocessor.parser import ( + GeneralNode, + HTMLNode, + LevelNode, + TemplateNode, +) from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext @@ -13,8 +17,9 @@ def parse_ruby( """Parse a HTML 'ruby' node for a kanji part and a 
furigana (ruby) part, and return a tuple containing those. Discard the rp-element's parentheses, we don't do anything with them.""" - ruby_nodes = [] - furi_nodes = [] + ruby_nodes: list[Union[str, WikiNode]] = [] + furi_nodes: list[Union[str, WikiNode]] = [] # furi_nodes is technically + # just list[WikiNode], but this appeases the type-checker for clean_node() for child in node.children: if ( not isinstance(child, WikiNode) @@ -31,14 +36,14 @@ def parse_ruby( # element with an empty something (apparently, seeing as how this # works), leaving no trace of the broken ruby element in the final # HTML source of the page! - return + return None return ruby_kanji, furigana def extract_ruby( wxr: WiktextractContext, - contents: Union[WikiNode, List[Union[WikiNode, str]]], -) -> Tuple[List[Tuple[str]], List[Union[WikiNode, str]]]: + contents: GeneralNode, +) -> tuple[list[tuple[str, str]], list[Union[WikiNode, str]]]: # If contents is a list, process each element separately extracted = [] new_contents = [] @@ -69,7 +74,7 @@ def extract_ruby( }: # Process args and children if kind != NodeKind.LINK: - new_node = LevelNode(new_node.loc) + new_node = LevelNode(kind, new_node.loc) new_args = [] for arg in contents.largs: e1, c1 = extract_ruby(wxr, arg) diff --git a/src/wiktextract/linkages.py b/src/wiktextract/linkages.py index 5efbbea42..0de6f427d 100644 --- a/src/wiktextract/linkages.py +++ b/src/wiktextract/linkages.py @@ -8,21 +8,33 @@ from wikitextprocessor import Wtp from typing import Dict, List, Union, Optional from .datautils import split_at_comma_semi, data_append -from .form_descriptions import (classify_desc, parse_head_final_tags, - parse_sense_qualifier, - head_final_bantu_langs, head_final_bantu_re, - head_final_other_langs, head_final_other_re, - head_final_numeric_langs, head_final_re) +from .form_descriptions import ( + classify_desc, + parse_head_final_tags, + parse_sense_qualifier, + head_final_bantu_langs, + head_final_bantu_re, + head_final_other_langs, + head_final_other_re, + head_final_numeric_langs, + head_final_re, +) from .tags import linkage_beginning_tags +from .type_utils import WordData # Linkage will be ignored if it matches this regexp before splitting linkage_pre_split_ignore_re = re.compile( - r"^(" + "|".join(re.escape(x) for x in [ - "For more variations, see ", - "Signal flag:", - "Semaphore:", - ]) + - r")") + r"^(" + + "|".join( + re.escape(x) + for x in [ + "For more variations, see ", + "Signal flag:", + "Semaphore:", + ] + ) + + r")" +) # Linkage will be ignored if it has one of these prefixes linkage_ignore_prefixes = [ @@ -63,31 +75,40 @@ # Linkage will be ignored if it matches this regexp linkage_ignore_re = re.compile( - r"^(" + "|".join(re.escape(x) for x in linkage_ignore_whole) + - r")$|^(" + "|".join(re.escape(x) for x in linkage_ignore_prefixes) + - r")|(" + "|".join(re.escape(x) for x in linkage_ignore_suffixes) + - r")$") + r"^(" + + "|".join(re.escape(x) for x in linkage_ignore_whole) + + r")$|^(" + + "|".join(re.escape(x) for x in linkage_ignore_prefixes) + + r")|(" + + "|".join(re.escape(x) for x in linkage_ignore_suffixes) + + r")$" +) # These prefixes will be removed from linkages, leaving the rest. This is # considered separately for each linkage in a list. 
linkage_remove_prefixes_re = re.compile( - r"^(" + - r"|".join(re.escape(x) for x in [ - ":", - "see Thesaurus:", - "See Thesaurus:", - "see also Thesaurus:", - "See also Thesaurus:", - "see also ", - "See also ", - "see ", - "See ", - "from ", - "abbreviation of ", - "ISO 639-1 code ", - "ISO 639-3 code ", - "Thesaurus:"]) + - ")") + r"^(" + + r"|".join( + re.escape(x) + for x in [ + ":", + "see Thesaurus:", + "See Thesaurus:", + "see also Thesaurus:", + "See also Thesaurus:", + "see also ", + "See also ", + "see ", + "See ", + "from ", + "abbreviation of ", + "ISO 639-1 code ", + "ISO 639-3 code ", + "Thesaurus:", + ] + ) + + ")" +) # When removing prefix from linkage, this dictionary can be used to map # the removed prefix to a space-separated list of tags to add @@ -101,17 +122,22 @@ r"(\s+on (Wikispecies|Wikimedia Commons|" r"[A-Z]\w+ Wiktionary|[A-Z]\w+ Wikipedia)\.?|" r"\s*[-–] Pre-reform orthography.*)" - r"$") + r"$" +) # Ignore linkage parenthesized sections that contain one of these strings linkage_paren_ignore_contains_re = re.compile( - r"\b(" + - "|".join(re.escape(x) for x in [ - "from Etymology", - "used as", - "usage notes", - ]) + - ")([, ]|$)") + r"\b(" + + "|".join( + re.escape(x) + for x in [ + "from Etymology", + "used as", + "usage notes", + ] + ) + + ")([, ]|$)" +) taxonomic_ending_map = { "superkingdoms": "superkingdom", @@ -133,7 +159,9 @@ taxonomic_ending_map[v] = v # Also add singular -> singular taxonomic_ending_re = re.compile( r"\s+[-‐‑‒–—]\s+({})$".format( - "|".join(re.escape(x) for x in taxonomic_ending_map))) + "|".join(re.escape(x) for x in taxonomic_ending_map) + ) +) # Exceptional splits for linkages. This can be used to fix particular linkages # that are not handled correctly by the default code. This can also be used @@ -146,10 +174,14 @@ # Truncate linkage word if it matches any of these strings linkage_truncate_re = re.compile( - "|".join(re.escape(x) for x in [ - " and its derived terms", - " UTF-16 0x214C", - ])) + "|".join( + re.escape(x) + for x in [ + " and its derived terms", + " UTF-16 0x214C", + ] + ) +) # Regexp for identifying special linkages containing lists of letters, digits, # or characters @@ -161,39 +193,47 @@ r" digits)(;|$)|" r"(^|; )(Letters using |Letters of the |" r"Variations of letter )|" - r"^(Hiragana|Katakana)$") + r"^(Hiragana|Katakana)$" +) # Matches an unicode character including any combining diacritics (even if # separate characters) -unicode_dc_re = re.compile(r"\w[{}]|.".format( - "".join(chr(x) for x in range(0, 0x110000) - if unicodedata.category(chr(x)) == "Mn"))) - - -def parse_linkage_item_text(wxr: Wtp, - word: str, - data: Dict[str, Union[list, str, dict]], - field: str, - item: str, - sense: Optional[str], - ruby: list, - pos_datas: list, - is_reconstruction: bool, - urls: Optional[List[str]] = None - ) -> Optional[str]: +unicode_dc_re = re.compile( + r"\w[{}]|.".format( + "".join( + chr(x) + for x in range(0, 0x110000) + if unicodedata.category(chr(x)) == "Mn" + ) + ) +) + + +def parse_linkage_item_text( + wxr: WiktextractContext, + word: str, + data: WordData, + field: str, + item: str, + sense: Optional[str], + ruby: list, + pos_datas: list, + is_reconstruction: bool, + urls: Optional[List[str]] = None, +) -> Optional[str]: """Parses a linkage item once it has been converted to a string. This may add one or more linkages to ``data`` under ``field``. 
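unicode_dc_re above pairs a word character with a following combining mark (Unicode category "Mn") and otherwise falls back to matching any single character. A scaled-down sketch of the same idea, restricted here to the U+0300..U+036F combining block so the class builds quickly (the real pattern scans the whole Unicode range):

import re
import unicodedata

# Character class of combining marks, limited to one block for speed;
# unicode_dc_re builds the analogous class over all of Unicode.
combining = "".join(
    chr(cp)
    for cp in range(0x0300, 0x0370)
    if unicodedata.category(chr(cp)) == "Mn"
)
dc_re = re.compile(r"\w[{}]|.".format(combining))

# "n" + U+0306 (combining breve) stays together as one match; everything
# else is picked up one character at a time by the "." fallback.
print([m.group(0) for m in dc_re.finditer("n\u0306o")])  # ['n̆', 'o']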
This returns None or a string that contains thats that should be applied to additional linkages (commonly used in tables for Asian characters).""" assert isinstance(wxr, WiktextractContext) - assert isinstance(word, str) # Main word (derived from page title) + assert isinstance(word, str) # Main word (derived from page title) assert isinstance(data, dict) # Parsed linkages are stored here under field assert isinstance(field, str) # The field under which to store linkage - assert isinstance(item, str) # The string to parse + assert isinstance(item, str) # The string to parse assert sense is None or isinstance(sense, str) - assert isinstance(ruby, list) # Captured ruby (hiragana/katakana) or "" + assert isinstance(ruby, list) # Captured ruby (hiragana/katakana) or "" assert isinstance(pos_datas, list) # List of senses (containing "glosses") - assert urls is None or isinstance(urls, list) # Captured urls + assert urls is None or isinstance(urls, list) # Captured urls assert is_reconstruction in (True, False) item = item.replace("()", "") @@ -229,7 +269,7 @@ def parse_linkage_item_text(wxr: Wtp, # Replace occurrences of ~ in the item by the page title safetitle = wxr.wtp.title.replace("\\", "\\\\") - item = item.replace(" ~ ", " " + safetitle + " ") + item = item.replace(" ~ ", " " + safetitle + " ") item = re.sub(r"^~ ", safetitle + " ", item) item = re.sub(r" ~$", " " + safetitle, item) @@ -239,7 +279,7 @@ def parse_linkage_item_text(wxr: Wtp, m = re.search(taxonomic_ending_re, item) if m: base_english = taxonomic_ending_map[m.group(1)] - item = item[:m.start()] + item = item[: m.start()] # Some Korean and Japanese words use "word (romanized): english" pattern # Sometimes the parenthesized part contains comma-separated alt and roman. @@ -248,13 +288,17 @@ def parse_linkage_item_text(wxr: Wtp, rom = m.group(2) eng = m.group(3) rest = m.group(1) - if (classify_desc(rest, no_unknown_starts=True) == "other" and - classify_desc(eng, no_unknown_starts=True) == "english"): + if ( + classify_desc(rest, no_unknown_starts=True) == "other" + and classify_desc(eng, no_unknown_starts=True) == "english" + ): item = rest base_roman = rom lst = base_roman.split(", ") - if (len(lst) == 2 and - classify_desc(lst[0], no_unknown_starts=True) == "other"): + if ( + len(lst) == 2 + and classify_desc(lst[0], no_unknown_starts=True) == "other" + ): base_alt = lst[0] base_roman = lst[1] if base_english: @@ -265,9 +309,10 @@ def parse_linkage_item_text(wxr: Wtp, # Many words have tags or similar descriptions in the beginning # followed by a colon and one or more linkages (e.g., # panetella/Finnish) - m = (re.match(r"^\((([^():]|\([^()]*\))+)\): ([^:]*)$", item) or - re.match(r"^([a-zA-Z][-'a-zA-Z0-9 ]*" - r"(\([^()]+\)[-'a-zA-Z0-9 ]*)*): ([^:]*)$", item)) + m = re.match(r"^\((([^():]|\([^()]*\))+)\): ([^:]*)$", item) or re.match( + r"^([a-zA-Z][-'a-zA-Z0-9 ]*" r"(\([^()]+\)[-'a-zA-Z0-9 ]*)*): ([^:]*)$", + item, + ) if m: desc = m.group(1) rest = m.group(len(m.groups())) @@ -326,12 +371,22 @@ def parse_linkage_item_text(wxr: Wtp, e1 = wxr.wtp.page_exists(desc) e2 = wxr.wtp.page_exists(rest) if cls != "tags": - if (cls2 == "tags" or - (e1 and not e1) or - (e1 and e2 and cls2 == "english" and - cls in ("other", "romanization")) or - (not e1 and not e2 and cls2 == "english" and - cls in ("other", "romanization"))): + if ( + cls2 == "tags" + or (e1 and not e1) + or ( + e1 + and e2 + and cls2 == "english" + and cls in ("other", "romanization") + ) + or ( + not e1 + and not e2 + and cls2 == "english" + and cls in ("other", 
"romanization") + ) + ): desc, rest = rest, desc # Looks like swapped syntax cls = cls2 if re.search(linkage_paren_ignore_contains_re, desc): @@ -364,48 +419,56 @@ def parse_linkage_item_text(wxr: Wtp, d = pos_datas[idx] gl = "; ".join(d.get("glosses", ())) if not gl: - wxr.wtp.debug("parenthesized numeric linkage prefix, " - "but the referenced sense has no gloss: " - "{}".format(desc), - sortid="linkages/355") + wxr.wtp.debug( + "parenthesized numeric linkage prefix, " + "but the referenced sense has no gloss: " + "{}".format(desc), + sortid="linkages/355", + ) elif sense: sense += "; " + gl else: sense = gl item = rest else: - wxr.wtp.debug("parenthesized numeric linkage prefix, " - "but there is no sense with such index: {}" - .format(desc), - sortid="linkages/365") + wxr.wtp.debug( + "parenthesized numeric linkage prefix, " + "but there is no sense with such index: {}".format(desc), + sortid="linkages/365", + ) item = rest else: - wxr.wtp.debug("unrecognized linkage prefix: {} desc={} rest={} " - "cls={} cls2={} e1={} e2={}" - .format(item, desc, rest, cls, cls2, e1, e2), - sortid="linkages/371") + wxr.wtp.debug( + "unrecognized linkage prefix: {} desc={} rest={} " + "cls={} cls2={} e1={} e2={}".format( + item, desc, rest, cls, cls2, e1, e2 + ), + sortid="linkages/371", + ) item = rest base_sense = sense # Check for certain plural tag forms at end of items list, and apply # them to all items if found - m = re.search(r" [-‐‑‒–—―] (diminutives|Diminutives|letters|digits|" - r"characters|symbols|tetragrams|letter names|names|" - r"female names|male names|proper nouns|contractions|" - r"nonstandard spellings|verbs|prepositions|postpositions|" - r"interjections|Abbreviations|abbreviations|variants|" - r"ordinals|nouns|phrases|adjectives|adverbs|" - r"augmentatives|pejoratives|compound words|numerals|" - r"Tally marks|surnames|modern nonstandard spellings)$", - item) + m = re.search( + r" [-‐‑‒–—―] (diminutives|Diminutives|letters|digits|" + r"characters|symbols|tetragrams|letter names|names|" + r"female names|male names|proper nouns|contractions|" + r"nonstandard spellings|verbs|prepositions|postpositions|" + r"interjections|Abbreviations|abbreviations|variants|" + r"ordinals|nouns|phrases|adjectives|adverbs|" + r"augmentatives|pejoratives|compound words|numerals|" + r"Tally marks|surnames|modern nonstandard spellings)$", + item, + ) if m: suffix = m.group(1) if base_qualifier: base_qualifier += ", " + suffix else: base_qualifier = suffix - item = item[:m.start()] + item = item[: m.start()] # Certain linkage items have space-separated valus. 
These are # generated by, e.g., certain templates @@ -443,17 +506,29 @@ def parse_linkage_item_text(wxr: Wtp, # Item1 contains " or " item2 = re.sub(r"\s*\([^)]*\)", "", item1) item2 = re.sub(r"\s+", " ", item2) - if ((lang not in head_final_bantu_langs or - not re.search(head_final_bantu_re, item2)) and - (lang not in head_final_other_langs or - not re.search(head_final_other_re, item2)) and - (not re.search(head_final_re, item2) or - (item2[-1].isdigit() and - lang not in head_final_numeric_langs)) and - not re.search(r"\bor\b", wxr.wtp.title) and - all(wxr.wtp.title not in x.split(" or ") + if ( + ( + lang not in head_final_bantu_langs + or not re.search(head_final_bantu_re, item2) + ) + and ( + lang not in head_final_other_langs + or not re.search(head_final_other_re, item2) + ) + and ( + not re.search(head_final_re, item2) + or ( + item2[-1].isdigit() + and lang not in head_final_numeric_langs + ) + ) + and not re.search(r"\bor\b", wxr.wtp.title) + and all( + wxr.wtp.title not in x.split(" or ") for x in split_at_comma_semi(item2) - if " or " in x)): + if " or " in x + ) + ): # We can split this item. Split the non-cleaned version # that still has any intervening parenthesized parts. subitems.extend(split_at_comma_semi(item1, extra=[" or "])) @@ -482,7 +557,7 @@ def parse_linkage_item_text(wxr: Wtp, m = re.search(r"\s*\(“([^”]+)”\)", item1) if m: t = m.group(1) - item1 = (item1[:m.start()] + item1[m.end():]).strip() + item1 = (item1[: m.start()] + item1[m.end() :]).strip() cls = classify_desc(t) if cls == "tags": if qualifier: @@ -494,20 +569,27 @@ def parse_linkage_item_text(wxr: Wtp, # Some Korean words use "word (alt, oman, “english”) pattern # See 滿/Korean - m = re.match(r'([^(),;:]+) \(([^(),;:]+), ([^(),;:]+), ' - r'[“”"]([^”“"]+)[“”"]\)$', item1) - if (m and - classify_desc(m.group(1), no_unknown_starts=True) == "other" and - classify_desc(m.group(2), no_unknown_starts=True) == "other"): + m = re.match( + r"([^(),;:]+) \(([^(),;:]+), ([^(),;:]+), " + r'[“”"]([^”“"]+)[“”"]\)$', + item1, + ) + if ( + m + and classify_desc(m.group(1), no_unknown_starts=True) == "other" + and classify_desc(m.group(2), no_unknown_starts=True) == "other" + ): alt = m.group(2) roman = m.group(3) english = m.group(4) item1 = m.group(1) words = item1.split(" ") - if (len(words) > 1 and - words[0] in linkage_beginning_tags and - words[0] != wxr.wtp.title): + if ( + len(words) > 1 + and words[0] in linkage_beginning_tags + and words[0] != wxr.wtp.title + ): t = linkage_beginning_tags[words[0]] item1 = " ".join(words[1:]) if qualifier: @@ -543,8 +625,9 @@ def english_repl(m): # sometimes both at the beginning and at the end. # And sometimes even in the middle, as in e.g. # wife/English/Translations/Yiddish - while (not script_chars and - (not sense or not re.search(script_chars_re, sense))): + while not script_chars and ( + not sense or not re.search(script_chars_re, sense) + ): par = None nonfirst_par = False if par is None: @@ -552,16 +635,17 @@ def english_repl(m): m = re.match(r"\((([^()]|\([^()]*\))*)\):?\s*", item1) if m: par = m.group(1) - item1 = item1[m.end():] + item1 = item1[m.end() :] else: # Try to find a parenthesized part at the end or from the # middle. 
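The reformatted pattern above handles items of the shape word (alt, roman, “english”). A small sketch of how it decomposes such an item; the sample string is invented, modelled on the 滿/Korean case cited in the comment:

import re

korean_item_re = re.compile(
    r"([^(),;:]+) \(([^(),;:]+), ([^(),;:]+), "
    r'[“”"]([^”“"]+)[“”"]\)$'
)

# Hypothetical linkage item in the "word (alt, roman, “english”)" shape.
m = korean_item_re.match("만주 (滿洲, manju, “Manchuria”)")
if m:
    word, alt, roman, english = m.groups()
    print(word, alt, roman, english, sep=" | ")
    # 만주 | 滿洲 | manju | Manchuria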
- m = re.search(r"\s+\((\d|\d\d|[^\d]([^()]|\([^()]*\))*)\)" - r"(\.$)?", - item1) + m = re.search( + r"\s+\((\d|\d\d|[^\d]([^()]|\([^()]*\))*)\)" r"(\.$)?", + item1, + ) if m: par = m.group(1) - item1 = item1[:m.start()] + item1[m.end():] + item1 = item1[: m.start()] + item1[m.end() :] nonfirst_par = True if not par: break @@ -588,7 +672,7 @@ def english_repl(m): qualifier = par[:idx] else: break - par = par[idx + 1:].strip() + par = par[idx + 1 :].strip() # Check for certain comma-separated tags combined # with English text at the beginning or end of a @@ -676,19 +760,22 @@ def english_repl(m): d = pos_datas[idx] gl = "; ".join(d.get("glosses", ())) if not gl: - wxr.wtp.debug("parenthesized number " - "but the referenced sense has no " - "gloss: {}".format(par), - sortid="linkages/665") + wxr.wtp.debug( + "parenthesized number " + "but the referenced sense has no " + "gloss: {}".format(par), + sortid="linkages/665", + ) elif sense: sense += "; " + gl else: sense = gl else: - wxr.wtp.debug("parenthesized number but there is " - "no sense with such index: {}" - .format(par), - sortid="linkages/674") + wxr.wtp.debug( + "parenthesized number but there is " + "no sense with such index: {}".format(par), + sortid="linkages/674", + ) else: if alt: alt += "; " + par @@ -706,8 +793,8 @@ def english_repl(m): # Remove certain prefixes from linkages m = re.match(linkage_remove_prefixes_re, item1) if m: - prefix = item1[:m.end()] - item1 = item1[m.end():] + prefix = item1[: m.end()] + item1 = item1[m.end() :] if prefix in linkage_remove_prefixes_tags: if qualifier: qualifier += ", " + linkage_remove_prefixes_tags[prefix] @@ -720,13 +807,13 @@ def english_repl(m): # Remove certain suffixes from linkages m = re.search(linkage_remove_suffixes_re, item1) if m: - item1 = item1[:m.start()] + item1 = item1[: m.start()] # Parse linkages with "value = english" syntax (e.g., # väittää/Finnish) idx = item1.find(" = ") if idx >= 0: - eng = item1[idx + 3:] + eng = item1[idx + 3 :] if classify_desc(eng, no_unknown_starts=True) == "english": english = eng item1 = item1[:idx] @@ -736,25 +823,25 @@ def english_repl(m): eng = item1[:idx] if classify_desc(eng, no_unknown_starts=True) == "english": english = eng - item1 = item1[idx + 3:] + item1 = item1[idx + 3 :] # Parse linkages with "value - english" syntax (e.g., # man/Faroese) m = re.search(r" [-‐‑‒–—―] ", item1) if m and "(" not in item1: - suffix = item1[m.end():] + suffix = item1[m.end() :] cls = classify_desc(suffix, no_unknown_starts=True) if cls == "english": # This case intentionally ignores old values from english # (otherwise taxonomic lists fail) english = suffix - item1 = item1[:m.start()] + item1 = item1[: m.start()] elif cls == "tags": if qualifier: qualifier += ", " + suffix else: qualifier = suffix - item1 = item1[:m.start()] + item1 = item1[: m.start()] # Parse certain tags at the end of the linked term (unless # we are in a letters list) @@ -768,7 +855,7 @@ def english_repl(m): m = re.search(linkage_truncate_re, item1) if m: # suffix = item1[m.start():] # Currently ignored - item1 = item1[:m.start()] + item1 = item1[: m.start()] if not item1: continue # Ignore empty link targets if item1 == word: @@ -794,9 +881,11 @@ def add(w, r): # split as this is used when we have a different number # of romanizations than written forms, and don't know # which is which. 
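The " = " and dash branches above both lean on classify_desc() to decide whether the right-hand side is an English gloss. A stripped-down sketch of that control flow, where looks_english() is an invented stand-in for classify_desc(...) == "english" and the inputs are illustrative, patterned on the väittää/Finnish and man/Faroese examples mentioned in the comments:

import re
from typing import Optional

def looks_english(text: str) -> bool:
    # Invented stand-in for classify_desc(text) == "english".
    return bool(re.fullmatch(r"[A-Za-z ,'-]+", text))

def split_linkage_gloss(item: str) -> tuple[str, Optional[str]]:
    # "value = english" syntax
    idx = item.find(" = ")
    if idx >= 0 and looks_english(item[idx + 3 :]):
        return item[:idx], item[idx + 3 :]
    # "value - english" syntax (various dash characters)
    m = re.search(r" [-‐‑‒–—―] ", item)
    if m and "(" not in item and looks_english(item[m.end() :]):
        return item[: m.start()], item[m.end() :]
    return item, None

print(split_linkage_gloss("väittää = to claim"))  # ('väittää', 'to claim')
print(split_linkage_gloss("maður – man"))         # ('maður', 'man')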
- if ((not w or "," not in w) and - (not r or "," not in r) and - not wxr.wtp.page_exists(w)): + if ( + (not w or "," not in w) + and (not r or "," not in r) + and not wxr.wtp.page_exists(w) + ): lst = w.split("/") if len(w) > 1 else [w] if len(lst) == 1: lst = w.split(" / ") @@ -811,9 +900,15 @@ def add(w, r): # Heuristically remove "." at the end of most linkages # (some linkage lists end in a period, but we also have # abbreviations that end with a period that should be kept) - if (w.endswith(".") and not wxr.wtp.page_exists(w) and - (wxr.wtp.page_exists(w[:-1]) or - (len(w) >= 5) and "." not in w[:-1])): + if ( + w.endswith(".") + and not wxr.wtp.page_exists(w) + and ( + wxr.wtp.page_exists(w[:-1]) + or (len(w) >= 5) + and "." not in w[:-1] + ) + ): w = w[:-1] # If we have roman but not alt and the word is ASCII, @@ -847,8 +942,9 @@ def add(w, r): if alt and alt.strip() != w: dt["alt"] = alt.strip() if urls: - dt["urls"] = [url.strip() for url in urls - if url and isinstance(url, str)] + dt["urls"] = [ + url.strip() for url in urls if url and isinstance(url, str) + ] dt["word"] = w for old in data.get(field, ()): if dt == old: @@ -870,9 +966,11 @@ def add(w, r): # print("lang={} v={} script_chars={} item1={!r}" # .format(wxr.wtp.section, v, script_chars, item1)) if v and script_chars: - if (len(item1.split()) > 1 or - len(list(re.finditer(unicode_dc_re, item1))) == 2 or - (len(subitems) > 10 and v in ("Hiragana", "Katakana"))): + if ( + len(item1.split()) > 1 + or len(list(re.finditer(unicode_dc_re, item1))) == 2 + or (len(subitems) > 10 and v in ("Hiragana", "Katakana")) + ): if v == qualifier: # if sense: # sense += "; " + qualifier @@ -881,9 +979,12 @@ def add(w, r): qualifier = None if re.search(r" (letters|digits|script)$", v): qualifier = v # Also parse as qualifier - elif re.search(r"Variations of letter |" - r"Letters using |" - r"Letters of the ", v): + elif re.search( + r"Variations of letter |" + r"Letters using |" + r"Letters of the ", + v, + ): qualifier = "letter" parts = item1.split(". ") extra = () @@ -892,23 +993,28 @@ def add(w, r): item1 = parts[0] # Handle multi-character names for chars in language's # alphabet, e.g., "Ny ny" in P/Hungarian. 
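The trailing-period heuristic earlier in this hunk keeps a final dot only when the dotted form itself is a known page (i.e. a genuine abbreviation). A compact sketch of the same rule, with page_exists standing in for wxr.wtp.page_exists and an invented set of page titles:

def strip_trailing_period(w: str, page_exists) -> str:
    # Drop a final "." unless the dotted form is itself a page, provided the
    # undotted form exists or the word is long enough and has no other dots.
    if (
        w.endswith(".")
        and not page_exists(w)
        and (page_exists(w[:-1]) or (len(w) >= 5 and "." not in w[:-1]))
    ):
        return w[:-1]
    return w

titles = {"etc.", "dog"}  # invented page titles
print(strip_trailing_period("etc.", titles.__contains__))       # etc.  (kept)
print(strip_trailing_period("dog.", titles.__contains__))       # dog
print(strip_trailing_period("elephant.", titles.__contains__))  # elephant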
- if (len(subitems) > 20 and len(item1.split()) == 2 and - all(len(x) <= 3 for x in item1.split())): - parts = list(m.group(0) for m in - re.finditer(r"(\w[\u0300-\u036f]?)+|.", - item1) - if not m.group(0).isspace() and - m.group(0) not in ("(", ")")) + if ( + len(subitems) > 20 + and len(item1.split()) == 2 + and all(len(x) <= 3 for x in item1.split()) + ): + parts = list( + m.group(0) + for m in re.finditer(r"(\w[\u0300-\u036f]?)+|.", item1) + if not m.group(0).isspace() + and m.group(0) not in ("(", ")") + ) else: - parts = list(m.group(0) for m in - re.finditer(r".[\u0300-\u036f]?", - item1) - if not m.group(0).isspace() and - m.group(0) not in ("(", ")")) + parts = list( + m.group(0) + for m in re.finditer(r".[\u0300-\u036f]?", item1) + if not m.group(0).isspace() + and m.group(0) not in ("(", ")") + ) for e in extra: idx = e.find(":") if idx >= 0: - e = e[idx + 1:].strip() + e = e[idx + 1 :].strip() if e.endswith("."): e = e[:-1] parts.extend(e.split()) @@ -920,10 +1026,11 @@ def add(w, r): rparts = None if roman: - rparts = list(m.group(0) for m in - re.finditer(r".[\u0300-\u036f]", - roman) - if not m.group(0).isspace()) + rparts = list( + m.group(0) + for m in re.finditer(r".[\u0300-\u036f]", roman) + if not m.group(0).isspace() + ) if len(rparts) != len(parts): rparts = None if not rparts: diff --git a/src/wiktextract/page.py b/src/wiktextract/page.py index bf3a7733a..f39e197a7 100644 --- a/src/wiktextract/page.py +++ b/src/wiktextract/page.py @@ -8,7 +8,18 @@ from typing import Any, Callable, Optional, Union from mediawiki_langcodes import get_all_names, name_to_code -from wikitextprocessor import NodeKind, WikiNode +from wikitextprocessor import ( + NodeKind, + WikiNode, +) +from wikitextprocessor.core import ( + TemplateArgs, + TemplateFnCallable, + PostTemplateFnCallable, +) +from wikitextprocessor.parser import ( + GeneralNode, +) from wiktextract.wxr_context import WiktextractContext @@ -56,9 +67,9 @@ def is_panel_template(wxr: WiktextractContext, template_name: str) -> bool: def recursively_extract( - contents: Union[WikiNode, list[WikiNode]], + contents: Union[WikiNode, str, list[Union[str, WikiNode]]], fn: Callable[[Union[WikiNode, list[WikiNode]]], bool], -) -> tuple[list[WikiNode], list[WikiNode]]: +) -> tuple[list[Union[str, WikiNode]], list[Union[str, WikiNode]]]: """Recursively extracts elements from contents for which ``fn`` returns True. This returns two lists, the extracted elements and the remaining content (with the extracted elements removed at each level). 
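recursively_extract() above is documented as returning two lists: the extracted nodes and the remaining content with those nodes removed at each level. A toy version over plain nested lists, assuming nothing about WikiNode internals (the real function also walks template arguments and node children):

from typing import Callable, Union

Node = Union[str, list]  # simplified stand-in for str | WikiNode

def recursively_extract_demo(
    contents: Union[Node, list],
    fn: Callable[[Node], bool],
) -> tuple[list, list]:
    # Pull out items for which fn() is true; recurse into nested lists and
    # keep the pruned structure in "remaining".
    extracted: list = []
    remaining: list = []
    items = contents if isinstance(contents, list) else [contents]
    for item in items:
        if isinstance(item, list):
            e, r = recursively_extract_demo(item, fn)
            extracted.extend(e)
            remaining.append(r)
        elif fn(item):
            extracted.append(item)
        else:
            remaining.append(item)
    return extracted, remaining

print(recursively_extract_demo(["a", ["RUBY", "b"], "c"], lambda n: n == "RUBY"))
# (['RUBY'], ['a', ['b'], 'c'])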
Only @@ -311,9 +322,9 @@ def remove_duplicate_data(page_data: dict) -> None: def clean_node( wxr: WiktextractContext, sense_data: Optional[Any], - wikinode: Union[str, WikiNode, list[Union[str, WikiNode]]], - template_fn: Optional[Callable[[str, dict], str]] = None, - post_template_fn: Optional[Callable[[str, dict, str], str]] = None, + wikinode: GeneralNode, + template_fn: Optional[TemplateFnCallable] = None, + post_template_fn: Optional[PostTemplateFnCallable] = None, collect_links: bool = False, ) -> str: """ diff --git a/src/wiktextract/type_utils.py b/src/wiktextract/type_utils.py index ea2bfa3de..81a26f911 100644 --- a/src/wiktextract/type_utils.py +++ b/src/wiktextract/type_utils.py @@ -1,7 +1,6 @@ from typing import ( Sequence, TypedDict, - Union, ) From 441529d530ffc7c325385061582e961abd44b99c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Mon, 29 Jan 2024 10:37:13 +0200 Subject: [PATCH 7/8] Remove parse_linkage_template This code has been unreachable for a couple of years, with an XXX comment about removing it; it's pretty hard to figure out if the changes Tatu was meaning to make actually were made, but seeing as how things have been running smoothly even when parse_linkage_template has been unreachable all this time, I opted to just remove the unused code. I've also commented out some data used by parse_linkage_template in linkage_template_mappings (a list of lists that should have been a dict?), and moved the linkage template names into a new template_linkages set to be used in the one place that still had a reference to it. --- src/wiktextract/extractor/en/page.py | 91 ++++++++++------------------ 1 file changed, 33 insertions(+), 58 deletions(-) diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py index f7286c46d..3ebd91c6b 100644 --- a/src/wiktextract/extractor/en/page.py +++ b/src/wiktextract/extractor/en/page.py @@ -639,19 +639,38 @@ # Template name component to linkage section listing. Integer section means # default section, starting at that argument. -template_linkage_mappings: list[list[Union[str, int]]] = [ - ["syn", "synonyms"], - ["synonyms", "synonyms"], - ["ant", "antonyms"], - ["antonyms", "antonyms"], - ["hyp", "hyponyms"], - ["hyponyms", "hyponyms"], - ["der", "derived"], - ["derived terms", "derived"], - ["coordinate terms", "coordinate_terms"], - ["rel", "related"], - ["col", 2], -] +# XXX not used anymore, except for the first elements: moved to +# template_linkages +# template_linkage_mappings: list[list[Union[str, int]]] = [ +# ["syn", "synonyms"], +# ["synonyms", "synonyms"], +# ["ant", "antonyms"], +# ["antonyms", "antonyms"], +# ["hyp", "hyponyms"], +# ["hyponyms", "hyponyms"], +# ["der", "derived"], +# ["derived terms", "derived"], +# ["coordinate terms", "coordinate_terms"], +# ["rel", "related"], +# ["col", 2], +# ] + +# Template names, this was exctracted from template_linkage_mappings, +# because the code using template_linkage_mappings was actually not used +# (but not removed). +template_linkages: set[str] = { + "syn", + "synonyms", + "ant", + "antonyms", + "hyp", + "hyponyms", + "der", + "derived terms", + "coordinate terms", + "rel", + "col", +} # Maps template name used in a word sense to a linkage field that it adds. 
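The new template_linkages set feeds the same whole-word test that the old mapping list did (see the usex_template_fn hunk further down): a template name counts as a linkage template when one of these names occurs in it as a whole word, optionally followed by digits. A small sketch of that membership check:

import re

template_linkages = {
    "syn", "synonyms", "ant", "antonyms", "hyp", "hyponyms",
    "der", "derived terms", "coordinate terms", "rel", "col",
}

def is_linkage_template(name: str) -> bool:
    # Same prefix/whole-word/digit-suffix pattern used where the set is
    # consumed in page.py.
    return any(
        re.search(r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name)
        for prefix in template_linkages
    )

print(is_linkage_template("syn"))         # True
print(is_linkage_template("col3"))        # True
print(is_linkage_template("quote-book"))  # False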
sense_linkage_templates: dict[str, str] = { @@ -2293,50 +2312,6 @@ def item_recurse( urls, ) - def parse_linkage_template(node): - nonlocal have_panel_template - # XXX remove this function but check how to handle the - # template_linkage_mappings - # print("LINKAGE TEMPLATE:", node) - - def linkage_template_fn(name, ht): - # print("LINKAGE_TEMPLATE_FN:", name, ht) - nonlocal field - nonlocal have_panel_template - if is_panel_template(wxr, name): - have_panel_template = True - return "" - for prefix, t in template_linkage_mappings: - if re.search( - r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name - ): - f = t if isinstance(t, str) else field - if ( - name.endswith("-top") - or name.endswith("-bottom") - or name.endswith("-mid") - ): - field = f - return "" - i = t if isinstance(t, int) else 2 - while True: - v = ht.get(i, None) - if v is None: - break - v = clean_node(wxr, None, v) - parse_linkage_item(v, f) - i += 1 - return "" - # print("UNHANDLED LINKAGE TEMPLATE:", name, ht) - return None - - # Main body of parse_linkage_template() - text = wxr.wtp.node_to_wikitext(node) - parsed = wxr.wtp.parse( - text, expand_all=True, template_fn=linkage_template_fn - ) - parse_linkage_recurse(parsed.children, field, None) - def parse_linkage_recurse(contents, field, sense): assert isinstance(contents, (list, tuple)) assert sense is None or isinstance(sense, str) @@ -3393,7 +3368,7 @@ def usex_template_fn(name, ht): usex_type = "example" elif name in quotation_templates: usex_type = "quotation" - for prefix, t in template_linkage_mappings: + for prefix in template_linkages: if re.search( r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name ): From 3e13e0e2bf84579cd6baced2f2a91d4c1610e9a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Mon, 29 Jan 2024 12:25:03 +0200 Subject: [PATCH 8/8] Fix: head_post_template_fn should return Optional[str] --- src/wiktextract/extractor/en/page.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py index 3ebd91c6b..23156cb44 100644 --- a/src/wiktextract/extractor/en/page.py +++ b/src/wiktextract/extractor/en/page.py @@ -942,7 +942,7 @@ def select_data() -> WordData: def head_post_template_fn( name: str, ht: TemplateArgs, expansion: str - ) -> str: + ) -> Optional[str]: """Handles special templates in the head section of a word. Head section is the text after part-of-speech subtitle and before word sense list. Typically it generates the bold line for the word, but @@ -974,7 +974,7 @@ def head_post_template_fn( # Note: various places expect to have content from wikipedia # templates, so cannot convert this to empty parse_wikipedia_template(wxr, pos_data, ht) - return "" + return None if name == "number box": # XXX extract numeric value? @@ -1002,7 +1002,7 @@ def head_post_template_fn( # XXX extract? return "" - return "" + return None def parse_part_of_speech(posnode: WikiNode, pos: str) -> None: """Parses the subsection for a part-of-speech under a language on