Skip to content

Commit

Permalink
--statistics never implemented, disabled
Browse files Browse the repository at this point in the history
--statistics command line parameter was never implemented.

There are bits of logic and code left over from the previous
attempt, such as merge_return, which assumes that certain
statistics data has been collected somewhere to be handled in
the central process. That statistics collection has never
actually been done except in one place: wxr.config.section_counts
gets incremented in en/page.py, but even that data is then
discarded if multiprocessing is used and merging is required
(because no data is sent with the merge).

Using --statistics doesn't actually print any statistics, except
when you are processing a single page (or running in a single
process): then you get section_counts.

What is currently typed as CollatedErrorReturnData is assumed to
actually be a larger collection that includes these count statistics.

I've commented out the code related to all of this; nothing
should break, because nothing actually touches this.

To make it easier to find all these random bits, I've commented
them with the term `STATISTICS_IMPLEMENTATION`.
  • Loading branch information
kristian-clausal committed Dec 21, 2023
1 parent ed8804d commit 7c83565
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 17 deletions.
26 changes: 15 additions & 11 deletions src/wiktextract/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,9 @@ def __init__(
self.expand_tables = expand_tables
# Some fields for statistics
self.num_pages = 0
self.language_counts = collections.defaultdict(int)
self.pos_counts = collections.defaultdict(int)
self.section_counts = collections.defaultdict(int)
self.language_counts: dict[str, int] = collections.defaultdict(int)
self.pos_counts: dict[str, int] = collections.defaultdict(int)
self.section_counts: dict[str, int] = collections.defaultdict(int)
# Some fields related to errors
# The word currently being processed.
self.word = None
Expand All @@ -114,14 +114,18 @@ def __init__(
self.load_edition_settings()

def merge_return(self, ret: CollatedErrorReturnData):
if "num_pages" in ret:
self.num_pages += ret["num_pages"]
for k, v in ret["language_counts"].items():
self.language_counts[k] += v
for k, v in ret["pos_counts"].items():
self.pos_counts[k] += v
for k, v in ret["section_counts"].items():
self.section_counts[k] += v
# XXX This was never properly implemented; even the only
# count (self.section_counts) that is updated during running
# gets discarded when doing batches instead of individual
# pages. Search: STATISTICS_IMPLEMENTATION
# if "num_pages" in ret:
# self.num_pages += ret["num_pages"]
# for k, v in ret["language_counts"].items():
# self.language_counts[k] += v
# for k, v in ret["pos_counts"].items():
# self.pos_counts[k] += v
# for k, v in ret["section_counts"].items():
# self.section_counts[k] += v
if "errors" in ret:
self.errors.extend(ret.get("errors", []))
self.warnings.extend(ret.get("warnings", []))
Expand Down
2 changes: 2 additions & 0 deletions src/wiktextract/extractor/en/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -2914,6 +2914,8 @@ def skip_template_fn(name, ht):
t = clean_node(wxr, etym_data,
node.sarg if node.sarg else node.largs)
t = t.lower()
# XXX these counts were never implemented fully, and even this
# gets discarded: Search STATISTICS_IMPLEMENTATION
wxr.config.section_counts[t] += 1
# print("PROCESS_CHILDREN: T:", repr(t))
if t.startswith(tuple(wxr.config.OTHER_SUBTITLES["pronunciation"])):
Expand Down
15 changes: 9 additions & 6 deletions src/wiktextract/wiktwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,12 +212,14 @@ def main():
default=False,
help="Capture descendants",
)
parser.add_argument(
"--statistics",
action="store_true",
default=False,
help="Print statistics",
)
# XXX Was never implemented fully.
# Search: STATISTICS_IMPLEMENTATION
# parser.add_argument(
# "--statistics",
# action="store_true",
# default=False,
# help="Print statistics",
# )
parser.add_argument(
"--page",
type=str,
Expand Down Expand Up @@ -521,6 +523,7 @@ def main():
pass
os.rename(out_tmp_path, out_path)

# XXX was never implemented fully. Search for: STATISTICS_IMPLEMENTATION
if args.statistics:
print("")
print("LANGUAGE COUNTS")
Expand Down

0 comments on commit 7c83565

Please sign in to comment.