diff --git a/src/wiktextract/config.py b/src/wiktextract/config.py index 1680902aa..801df8bf6 100644 --- a/src/wiktextract/config.py +++ b/src/wiktextract/config.py @@ -96,9 +96,9 @@ def __init__( self.expand_tables = expand_tables # Some fields for statistics self.num_pages = 0 - self.language_counts = collections.defaultdict(int) - self.pos_counts = collections.defaultdict(int) - self.section_counts = collections.defaultdict(int) + self.language_counts: dict[str, int] = collections.defaultdict(int) + self.pos_counts: dict[str, int] = collections.defaultdict(int) + self.section_counts: dict[str, int] = collections.defaultdict(int) # Some fields related to errors # The word currently being processed. self.word = None @@ -114,14 +114,18 @@ def __init__( self.load_edition_settings() def merge_return(self, ret: CollatedErrorReturnData): - if "num_pages" in ret: - self.num_pages += ret["num_pages"] - for k, v in ret["language_counts"].items(): - self.language_counts[k] += v - for k, v in ret["pos_counts"].items(): - self.pos_counts[k] += v - for k, v in ret["section_counts"].items(): - self.section_counts[k] += v + # XXX This was never properly implemented; even the only + # count (self.section_counts) that is updated during running + # gets discarded when doing batches instead of individual + # pages. 
Search: STATISTICS_IMPLEMENTATION + # if "num_pages" in ret: + # self.num_pages += ret["num_pages"] + # for k, v in ret["language_counts"].items(): + # self.language_counts[k] += v + # for k, v in ret["pos_counts"].items(): + # self.pos_counts[k] += v + # for k, v in ret["section_counts"].items(): + # self.section_counts[k] += v if "errors" in ret: self.errors.extend(ret.get("errors", [])) self.warnings.extend(ret.get("warnings", [])) diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py index a57cc191c..766141cb2 100644 --- a/src/wiktextract/extractor/en/page.py +++ b/src/wiktextract/extractor/en/page.py @@ -2914,6 +2914,8 @@ def skip_template_fn(name, ht): t = clean_node(wxr, etym_data, node.sarg if node.sarg else node.largs) t = t.lower() + # XXX these counts were never implemented fully, and even this + # gets discarded: Search STATISTICS_IMPLEMENTATION wxr.config.section_counts[t] += 1 # print("PROCESS_CHILDREN: T:", repr(t)) if t.startswith(tuple(wxr.config.OTHER_SUBTITLES["pronunciation"])): diff --git a/src/wiktextract/wiktwords.py b/src/wiktextract/wiktwords.py index 6a896d178..9ce9cf9f7 100755 --- a/src/wiktextract/wiktwords.py +++ b/src/wiktextract/wiktwords.py @@ -212,12 +212,14 @@ def main(): default=False, help="Capture descendants", ) - parser.add_argument( - "--statistics", - action="store_true", - default=False, - help="Print statistics", - ) + # XXX Was never implemented fully. + # Search: STATISTICS_IMPLEMENTATION + # parser.add_argument( + # "--statistics", + # action="store_true", + # default=False, + # help="Print statistics", + # ) parser.add_argument( "--page", type=str, @@ -521,6 +523,8 @@ def main(): pass os.rename(out_tmp_path, out_path) + # XXX was never implemented fully. Search for: STATISTICS_IMPLEMENTATION + # The --statistics option is gone, so args may lack this attribute. - if args.statistics: + if getattr(args, "statistics", False): print("") print("LANGUAGE COUNTS")