# wiki.py
import operator
import typing
from datetime import datetime

import requests
from tqdm import tqdm

from .caching import cache
from .logs import get_logger
from .paths import coppermind_cache_path

__all__ = ["coppermind_query", "extract_relevant_info"]

logger = get_logger("csn.utils.wiki")
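

# Note: @cache appears to be the project's own disk-caching decorator (from
# .caching), persisting the query result at coppermind_cache_path so repeated
# calls can skip the network round-trip; that behavior is inferred from the
# names, not confirmed here.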
@cache(coppermind_cache_path)
def coppermind_query() -> typing.List[dict]:
    """Load character page data from coppermind.net."""
    logger.info("Beginning query of coppermind.net.")
    def batch_query():
        """Query the coppermind.net API in batches for all character pages."""
        # query generator code based on https://www.mediawiki.org/wiki/API:Query#Continuing_queries
        wiki_api = "https://coppermind.net/w/api.php"

        # get the total number of pages to fetch (used only for the progress bar)
        r = requests.get(
            wiki_api,
            params=dict(
                action="query",
                format="json",
                prop="categoryinfo",
                titles="Category:Characters",
            ),
        )
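        # "40" is the pageid of Category:Characters; this first request uses the
        # API's default formatversion=1, which keys results by pageid string.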
        num_pages = (
            r.json()
            .get("query", {})
            .get("pages", {})
            .get("40", {})
            .get("categoryinfo", {})
            .get("pages")
        )
        # query the server until finished
        payload = {
            "action": "query",
            "format": "json",
            "prop": "revisions",
            "generator": "categorymembers",
            "rvprop": "content|timestamp",
            "rvslots": "main",
            "rvsection": 0,
            "gcmtitle": "Category:Characters",
            "gcmprop": "ids|title",
            "gcmtype": "page",
            "gcmlimit": 50,
            "formatversion": 2,
        }
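        # rvsection=0 limits revision content to section 0 (the lead section),
        # gcmlimit=50 fetches 50 category members per request, and formatversion=2
        # makes the API return "pages" as a list, matching the list handling below.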
        continue_data = {}
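        # MediaWiki continuation: merge each response's "continue" object into the
        # next request; once a response arrives without a "continue" key, every
        # batch has been delivered and the loop exits.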
        with tqdm(total=num_pages, unit=" pages") as progress_bar:
            while continue_data is not None:
                req = payload.copy()
                req.update(continue_data)
                r = requests.get(wiki_api, params=req)
                response = r.json()
                num_results = len(response.get("query", {}).get("pages", []))
                logger.debug(
                    f"Batch of {num_results} results received from coppermind.net."
                )
                if "error" in response:
                    raise RuntimeError(response["error"])
                if "warnings" in response:
                    # log rather than print, so tqdm's progress bar isn't mangled
                    logger.warning(response["warnings"])
                if "query" in response:
                    yield response["query"].get("pages", [])
                continue_data = response.get("continue", None)
                progress_bar.update(num_results)
        logger.info("Finished query of coppermind.net.")
    return sorted(
        (page for batch in batch_query() for page in batch),
        key=operator.itemgetter("pageid"),
    )


def extract_relevant_info(result: dict) -> dict:
    """Flatten the relevant fields of a MediaWiki API result into a single-level dictionary."""
    # these fields may be missing from the result, so fall back to safe defaults
    page_id = result.get("pageid")
    timestamp_str = result.get("revisions", [{}])[0].get("timestamp", "")
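    # MediaWiki timestamps are ISO 8601 with a trailing "Z"; swap it for "+00:00"
    # since datetime.fromisoformat() only accepts the "Z" suffix from Python 3.11 on.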
    timestamp = (
        datetime.fromisoformat(timestamp_str.replace("Z", "+00:00"))
        if timestamp_str
        else None
    )
    content = (
        result.get("revisions", [{}])[0]
        .get("slots", {})
        .get("main", {})
        .get("content", "")
    )
    return {
        "pageid": int(page_id) if page_id else None,
        "title": result.get("title", ""),
        "timestamp": timestamp,
        "content": content,
    }
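

# A minimal usage sketch (not part of the original module): because of the
# relative imports above, this file must run as part of its package, e.g.
# `python -m csn.utils.wiki` assuming the package path matches the logger name.
if __name__ == "__main__":
    pages = coppermind_query()
    records = [extract_relevant_info(page) for page in pages]
    print(f"Fetched {len(records)} character pages.")
    if records:
        print(f"First page: {records[0]['title']} (pageid {records[0]['pageid']})")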