-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdisambiguation.py
307 lines (257 loc) · 10.1 KB
/
disambiguation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
from typing import Optional, List
import colorama
import yaml
from tqdm import tqdm
from core.characters import Character, characters, lookup
from core.config import InteractionNetworkConfig
import utils.constants as constants
from utils.epub import tokenize_chapters
from utils.exceptions import AmbiguousReferenceError
from utils.paths import disambiguation_dir
from .input import ask, clear_screen, menu, yn_question
from .logs import get_logger
from .simpletypes import RunContext
# Names exported by `from <module> import *`.
__all__ = ["verify_presence", "check_position", "disambiguate_book"]
# Window sizes taken from the interaction-network configuration:
# RUN_SIZE is the number of tokens scanned per run; PREV_LINES / NEXT_LINES
# are how many RUN_SIZE-token chunks of context are shown before / after it.
RUN_SIZE = InteractionNetworkConfig.run_size
PREV_LINES = InteractionNetworkConfig.prev_cxt_lines
NEXT_LINES = InteractionNetworkConfig.next_cxt_lines
# Initialise colorama with autoreset enabled
# (presumably for the highlighted context printed to the terminal -- confirm).
colorama.init(autoreset=True)
logger = get_logger("csn.core.disambiguation")
# Index of every known character keyed by its id; used by _recall to turn the
# ids stored in a disambiguation mapping back into Character objects.
_char_ids = {c.id: c for c in characters}
def _save(pos: int, char: Optional[Character], disambiguation: dict) -> None:
disambiguation[pos] = char.id if char is not None else None
def _recall(pos: int, disambiguation: dict) -> Optional[Character]:
    """Look up the character previously stored for token position *pos*.

    Args:
        pos: Token position to look up.
        disambiguation: Mapping of positions to stored character ids.

    Returns:
        The matching entry of the module-level ``_char_ids`` index, or
        ``None`` when nothing is stored for *pos* or the stored id is not a
        known character id (this includes a stored ``None``).
    """
    stored_id = disambiguation.get(pos)
    return _char_ids.get(stored_id)
def char_search(prompt: Optional[str]) -> Optional[Character]:
    """Interactively ask the user for a character until one is identified.

    The answer is looked up in ``lookup`` directly when it is a key there and
    via ``lookup[answer:]`` otherwise (presumably a prefix search -- confirm
    against the ``lookup`` implementation).  When several characters match and
    none of them is named exactly like the answer, the candidates are listed
    and the question is asked again without a prompt.

    Args:
        prompt: Text passed to ``ask`` for the first question; may be ``None``.

    Returns:
        The selected character, or ``None`` when ``ask`` returns ``None``.
    """
    while True:
        answer = ask(
            prompt=prompt,
            validator=lambda r: slice(r, None) in lookup,
            error="Character not found.",
        )
        if answer is None:
            return None

        candidates = lookup[answer] if answer in lookup else lookup[answer:]
        if len(candidates) <= 1:
            return candidates[0]

        # Several hits: an exact name match wins outright.
        for candidate in candidates:
            if candidate.name == answer:
                return candidate

        print(f"Multiple matches found for {answer}. Please specify:")
        for candidate in candidates:
            print(f" {candidate.details}")
        # Follow-up questions are asked without repeating the prompt.
        prompt = None
def clarify_list(
name: str, matches: list, context: RunContext, pos: int
) -> Optional[Character]:
response = menu(
prompt=f'Ambiguous reference found for "{name}"! Please choose the correct character.',
options=[f" {i + 1}: {c.details}" for i, c in enumerate(matches)]
+ [
f" o: The correct character is not listed.",
f" #: This is not a character.",
],
validator=lambda r: (r.isdigit() and int(r) <= len(matches))
or r.lower().startswith("#")
or r.lower().startswith("o"),
)
if response.startswith("#"):
return None
elif response.startswith("o"):
char = char_search(
"Type the name of the character the keyword is referring to: "
)
logger.debug(
f'Matched ambiguous reference from "{name}" at {context.chapter}:{pos}, '
f"identified manually as {repr(char)}."
)
return char
else:
char = matches[int(response) - 1]
logger.debug(
f'Matched ambiguous reference from "{name}" at {context.chapter}:{pos}, '
f"identified from list as {repr(char)}."
)
return char
def verify_presence(key: str, ch: Character, word: str) -> bool:
    """Tell whether character *ch* appears in the book that *key* belongs to.

    Args:
        key: Book key; only the part before the first ``/`` is compared
            against ``ch.books``.
        ch: Candidate character.
        word: The text that produced the candidate; used only for logging.

    Returns:
        ``True`` when the book is listed in ``ch.books``.  Otherwise the
        rejection is logged at debug level and ``False`` is returned.
    """
    book = key.partition("/")[0]
    if book in ch.books:
        return True

    logger.debug(
        f'Rejected ambiguous reference from "{word}", identified automatically as '
        f"{repr(ch)} who does not appear in {key}."
    )
    return False
def disambiguate_name(
    key: str, name: str, disambiguation: dict, pos: int, context: RunContext
) -> Optional[Character]:
    """Resolve the character that *name* refers to at token position *pos*.

    A decision already stored for *pos* is replayed.  Otherwise the candidates
    from ``lookup[name]`` are narrowed to those present in the book:
    none -> ``None``; exactly one -> that character; several -> a hard-coded
    pick for "Sadeas" / "Roshone", or an interactive choice by the user.  Only
    decisions made among several candidates are stored.

    Args:
        key: Book key, passed to ``verify_presence``.
        name: The matched name text.
        disambiguation: Per-chapter mapping of positions to character ids;
            updated in place when a new decision is stored.
        pos: Token position of the reference.
        context: Run context providing ``chapter``, ``prev``, ``run`` and
            ``next`` for logging and for the text shown to the user.

    Returns:
        The resolved character, or ``None`` when there is none.
    """
    # Known position: replay the stored answer without asking again.
    if pos in disambiguation:
        remembered = _recall(pos, disambiguation)
        if remembered is not None:
            logger.debug(
                f"Matched ambiguous reference at {context.chapter}:{pos} using disambiguation, "
                f"identified automatically as {repr(remembered)}."
            )
        return remembered

    candidates = [c for c in lookup[name] if verify_presence(key, c, name)]

    if not candidates:
        return None

    if len(candidates) == 1:
        only = candidates[0]
        logger.debug(
            f'Matched ambiguous reference from "{name}" at {context.chapter}:{pos}, '
            f"identified automatically as {repr(only)} with presence in {key}."
        )
        return only

    # Several candidates remain; the outcome is stored for later runs.
    if name == "Sadeas":
        chosen = lookup["Torol Sadeas"][0]
    elif name == "Roshone":
        chosen = lookup["Roshone"][0]
    else:
        for passage in (context.prev, context.run, context.next):
            print(passage)
        chosen = clarify_list(name, candidates, context, pos)
        clear_screen()
    _save(pos, chosen, disambiguation)
    return chosen
def disambiguate_title(
    title: str, disambiguation: dict, pos: int, context: RunContext
) -> Optional[Character]:
    """Resolve which character a bare title refers to at token position *pos*.

    A decision already stored for *pos* is replayed; otherwise the surrounding
    text is printed and the user is asked through ``char_search``.

    Args:
        title: The title text that was matched.
        disambiguation: Per-chapter mapping of positions to character ids;
            updated in place with the user's answer.
        pos: Token position of the title.
        context: Run context providing ``chapter``, ``prev``, ``run`` and
            ``next``.

    Returns:
        The resolved character, or ``None`` when no character was given.
    """
    if pos in disambiguation:
        remembered = _recall(pos, disambiguation)
        if remembered is not None:
            logger.debug(
                f"Matched ambiguous reference at {context.chapter}:{pos} using disambiguation, "
                f"identified automatically as {repr(remembered)}."
            )
        return remembered

    for passage in (context.prev, context.run, context.next):
        print(passage)
    answer = char_search(
        f'Ambiguous reference found for "{title}". Who does this refer to?'
    )
    clear_screen()
    if answer is not None:
        logger.debug(
            f'Matched ambiguous reference from "{title}" at {context.chapter}:{pos}, '
            f"identified manually as {repr(answer)}."
        )
    # NOTE(review): the answer is stored even when it is None, so the position
    # counts as handled on later runs -- confirm this matches the intended nesting.
    _save(pos, answer, disambiguation)
    return answer
def disambiguate_book(key: str) -> None:
    """Walk through a book and resolve every ambiguous character reference.

    The decisions are read from and written back to the YAML file
    ``disambiguation_dir / key`` (with a ``.yml`` suffix), which is created
    when missing.  The file maps each chapter to a mapping of token positions
    to character ids.

    Each chapter's tokens are scanned in runs of ``RUN_SIZE`` tokens.  Within
    a run, three-token names, two-token names, titles and single-token names
    are tried in that order and handed to ``disambiguate_name`` /
    ``disambiguate_title``, which may prompt the user.

    Args:
        key: Book key; used for the file path, for ``tokenize_chapters`` and
            for the presence check done by ``disambiguate_name``.
    """
    disambiguation_path = (disambiguation_dir / key).with_suffix(".yml")
    if not disambiguation_path.exists():
        disambiguation_path.parent.mkdir(parents=True, exist_ok=True)
        disambiguation_path.touch()
    with disambiguation_path.open(mode="r") as f:
        # NOTE(review): yaml.Loader can construct arbitrary Python objects.
        # Presumably acceptable because this file is written by this tool;
        # consider yaml.safe_load if the file could come from elsewhere.
        disambiguation = yaml.load(f, yaml.Loader)
    # An empty (e.g. freshly touched) file loads as None.
    if disambiguation is None:
        disambiguation = {}
    tokenized_chapters = list(tokenize_chapters(key))
    for i, (chapter, tokens) in enumerate(tokenized_chapters):
        print(f"=== {key}: {chapter} ({i + 1}/{len(tokenized_chapters)}) ===")
        if chapter not in disambiguation:
            disambiguation[chapter] = {}
        # idx is the token offset at which the current run starts.
        idx = 0
        while idx < len(tokens):
            # Run-relative offsets just past each reference resolved in this run
            # (i is advanced before it is appended below).
            found = []
            context = RunContext(
                chapter=chapter,
                # prev: up to PREV_LINES chunks of RUN_SIZE tokens before idx,
                #       farthest chunk first; chunks that join to "" are dropped.
                # run:  the RUN_SIZE tokens starting at idx (fewer at chapter end).
                # next: up to NEXT_LINES chunks of RUN_SIZE tokens after the run.
                # fmt: off
                prev=[s for s in (' '.join(tokens[max(0, idx - (i * RUN_SIZE)):
                                                  max(0, idx - ((i - 1) * RUN_SIZE))]).strip()
                                  for i in range(PREV_LINES, 0, -1)) if s],
                run=tokens[idx:min(len(tokens), idx + RUN_SIZE)],
                next=[s for s in (' '.join(tokens[min(len(tokens), idx + (i * RUN_SIZE)):
                                                  min(len(tokens), idx + ((i + 1) * RUN_SIZE))]).strip()
                                  for i in range(1, NEXT_LINES + 1)) if s]
                # fmt: on
            )
            # From here on i is the offset inside the run; this rebinds the
            # chapter counter of the enclosing for loop, which is only read in
            # the header print above and is reassigned on the next chapter.
            i = 0
            tokens_remaining = len(tokens) - idx
            while i < min(RUN_SIZE, tokens_remaining):
                pos = idx + i
                this_token = tokens[pos]
                # Look-ahead tokens default to "" past the end of the chapter.
                next_token = tokens[pos + 1] if pos + 1 < len(tokens) else ""
                third_token = tokens[pos + 2] if pos + 2 < len(tokens) else ""
                two_tokens = this_token + " " + next_token
                next_two_tokens = next_token + " " + third_token
                three_tokens = two_tokens + " " + third_token
                # Longest match first: three-token name, two-token name,
                # title, then single-token name.
                if three_tokens in lookup:
                    ctx = context.highlight(i, 3)
                    i += 3
                    if (
                        disambiguate_name(
                            key, three_tokens, disambiguation[chapter], pos, ctx
                        )
                        is not None
                    ):
                        found.append(i)
                elif two_tokens in lookup:
                    ctx = context.highlight(i, 2)
                    i += 2
                    if (
                        disambiguate_name(
                            key, two_tokens, disambiguation[chapter], pos, ctx
                        )
                        is not None
                    ):
                        found.append(i)
                elif this_token in constants.titles:
                    ctx = context.highlight(i, 1)
                    i += 1
                    # A title is only resolved on its own when it is not
                    # followed by a one- or two-token name known to lookup.
                    if (
                        next_token not in lookup
                        and next_two_tokens not in lookup
                        and disambiguate_title(
                            this_token, disambiguation[chapter], pos, ctx
                        )
                        is not None
                    ):
                        found.append(i)
                elif this_token in lookup:
                    ctx = context.highlight(i, 1)
                    i += 1
                    if (
                        disambiguate_name(
                            key, this_token, disambiguation[chapter], pos, ctx
                        )
                        is not None
                    ):
                        found.append(i)
                else:
                    i += 1
                    continue
            # Decide how far the next run starts after this one.
            # advance past first found character
            if len(found) >= 2:
                delta = found[0] + 1
            # advance until only character is at beginning of run (max threshold)
            elif len(found) == 1:
                delta = found[0] if found[0] > 0 else 1
            # skip run if no chars found
            else:
                # NOTE(review): this is 0 when RUN_SIZE == 1, in which case idx
                # would never advance -- confirm RUN_SIZE is always > 1.
                delta = RUN_SIZE - 1
            idx += delta
        # NOTE(review): indentation was not recoverable for this statement; it is
        # placed inside the chapter loop so progress is written after every
        # chapter -- confirm against the intended nesting.
        with disambiguation_path.open(mode="w") as f:
            yaml.dump(
                disambiguation,
                f,
                yaml.Dumper,
                default_flow_style=False,
                sort_keys=False,
            )
def check_position(disambiguation, pos):
    """Return the character stored for *pos*, insisting that an entry exists.

    Args:
        disambiguation: Mapping of token positions to stored character ids.
        pos: Token position to look up.

    Returns:
        Whatever ``_recall`` yields for *pos*: a character, or ``None`` when
        the stored id is not a known character id.

    Raises:
        AmbiguousReferenceError: If *pos* has no entry in *disambiguation*.
    """
    if pos not in disambiguation:
        raise AmbiguousReferenceError(
            f"Reference at position {pos} not found in disambiguation. "
            f"Please run disambiguation again."
        )
    return _recall(pos, disambiguation)