From 7c835658145ab7c8c13f75d6a25d41d92c3c7065 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Thu, 21 Dec 2023 14:35:56 +0200 Subject: [PATCH] --statistics never implemented, disabled The --statistics command line parameter was never implemented. There are bits of logic and code left over from the previous attempt, such as merge_return, which assumes that certain statistics data has been collected somewhere to be handled in the central process. That statistics collection has never actually been done, except in one place: wxr.config.section_counts gets incremented in en/page.py, BUT even that data is then discarded if multiprocessing is used and merging is required (because no such data is sent with the merge). Using --statistics doesn't actually print any statistics unless you are processing a single page (or using a single process): then you get section_counts. What is currently typed as CollatedErrorReturnData was assumed to actually be a larger collection that also includes these count statistics. I've commented out the code related to all of this; nothing should break, because nothing actually touches it. To make it easier to find all these scattered bits, I've marked them with comments containing the term `STATISTICS_IMPLEMENTATION`. 
--- src/wiktextract/config.py | 26 +++++++++++++++----------- src/wiktextract/extractor/en/page.py | 2 ++ src/wiktextract/wiktwords.py | 15 +++++++++------ 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/src/wiktextract/config.py b/src/wiktextract/config.py index 1680902aa..801df8bf6 100644 --- a/src/wiktextract/config.py +++ b/src/wiktextract/config.py @@ -96,9 +96,9 @@ def __init__( self.expand_tables = expand_tables # Some fields for statistics self.num_pages = 0 - self.language_counts = collections.defaultdict(int) - self.pos_counts = collections.defaultdict(int) - self.section_counts = collections.defaultdict(int) + self.language_counts: dict[str, int] = collections.defaultdict(int) + self.pos_counts: dict[str, int] = collections.defaultdict(int) + self.section_counts: dict[str, int] = collections.defaultdict(int) # Some fields related to errors # The word currently being processed. self.word = None @@ -114,14 +114,18 @@ def __init__( self.load_edition_settings() def merge_return(self, ret: CollatedErrorReturnData): - if "num_pages" in ret: - self.num_pages += ret["num_pages"] - for k, v in ret["language_counts"].items(): - self.language_counts[k] += v - for k, v in ret["pos_counts"].items(): - self.pos_counts[k] += v - for k, v in ret["section_counts"].items(): - self.section_counts[k] += v + # XXX This was never properly implemented; even the only + # count (self.section_counts) that is updated during running + # gets discarded when doing batches instead of individual + # pages. 
Search: STATISTICS_IMPLEMENTATION + # if "num_pages" in ret: + # self.num_pages += ret["num_pages"] + # for k, v in ret["language_counts"].items(): + # self.language_counts[k] += v + # for k, v in ret["pos_counts"].items(): + # self.pos_counts[k] += v + # for k, v in ret["section_counts"].items(): + # self.section_counts[k] += v if "errors" in ret: self.errors.extend(ret.get("errors", [])) self.warnings.extend(ret.get("warnings", [])) diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py index a57cc191c..766141cb2 100644 --- a/src/wiktextract/extractor/en/page.py +++ b/src/wiktextract/extractor/en/page.py @@ -2914,6 +2914,8 @@ def skip_template_fn(name, ht): t = clean_node(wxr, etym_data, node.sarg if node.sarg else node.largs) t = t.lower() + # XXX these counts were never implemented fully, and even this + # gets discarded: Search STATISTICS_IMPLEMENTATION wxr.config.section_counts[t] += 1 # print("PROCESS_CHILDREN: T:", repr(t)) if t.startswith(tuple(wxr.config.OTHER_SUBTITLES["pronunciation"])): diff --git a/src/wiktextract/wiktwords.py b/src/wiktextract/wiktwords.py index 6a896d178..9ce9cf9f7 100755 --- a/src/wiktextract/wiktwords.py +++ b/src/wiktextract/wiktwords.py @@ -212,12 +212,14 @@ def main(): default=False, help="Capture descendants", ) - parser.add_argument( - "--statistics", - action="store_true", - default=False, - help="Print statistics", - ) + # XXX Was never implemented fully. + # Search: STATISTICS_IMPLEMENTATION + # parser.add_argument( + # "--statistics", + # action="store_true", + # default=False, + # help="Print statistics", + # ) parser.add_argument( "--page", type=str, @@ -521,6 +523,7 @@ def main(): pass os.rename(out_tmp_path, out_path) + # XXX was never implemented fully. Search for: STATISTICS_IMPLEMENTATION if args.statistics: print("") print("LANGUAGE COUNTS")