# get_fiction.py
# Part of a fork of gucorpling/amalgum.
# (GitHub page chrome and the line-number gutter captured by the web scrape were removed here.)
import io, re, sys, os
from glob import glob
from random import shuffle, seed, choice
from lib.utils import Document
from gutenberg.acquire import load_etext
from gutenberg.acquire.text import _format_download_uri
from gutenberg.cleanup import strip_headers
# Constants for limiting text sizes based on space count
# (the script counts " " characters as a cheap proxy for token counts).
# MAX_SPACES: a finished document with more spaces than this is rejected.
MAX_SPACES = 1000
# MIN_SPACES: an excerpt that collected fewer spaces than this is skipped.
MIN_SPACES = 300
# TARGET_SPACES: paragraph collection for one excerpt stops once this count is exceeded.
TARGET_SPACES = 800
# TOTAL_GENRE_SIZE: the whole run stops once the accepted documents exceed this many spaces.
TOTAL_GENRE_SIZE = 430000 # approx. ratio of 0.85 spaces to tokens, so just over 0.5 M tokens
def detect_hyphenation(text):
    """Heuristically flag texts whose line-break hyphenation was never restored.

    Returns True when any of a handful of tell-tale word fragments
    (a prefix, a hyphen, a space and the start of the continuation)
    occurs in *text*, False otherwise.
    """
    for fragment in ("sug- g", "dis- t", "ig- n", "un- "):
        if fragment in text:
            return True
    return False
def detect_non_fiction(text):
    """Heuristically flag excerpts that should not be kept as fiction.

    Returns True when *text* contains, case-insensitively, one of several
    markers of front matter or apparatus (contents lists, endnotes,
    acknowledgments, translator credits), or, case-sensitively, one of a
    few archaic word forms. Returns False otherwise.
    """
    lowered = text.lower()
    apparatus_markers = (
        "endnote",
        "<head>contents",
        "<head>table of contents",
        "<head>list of chapters",
        "<p>contents.</p>",
        "<head>illustrations</head>",
        "<head>acknowledgments",
        " trans. by",
    )
    if any(marker in lowered for marker in apparatus_markers):
        return True

    # Archaic language (matched against the original casing, lower case only)
    archaic_forms = (" doth ", " hath ", " thou art ")
    return any(form in text for form in archaic_forms)
def get_paragraphs(text):
    """Divides a string into a list of paragraphs based on blank lines.

    Leading spaces/tabs are removed from every line, all remaining tabs are
    deleted, and runs of spaces are collapsed to a single space. The text is
    then split wherever two or more consecutive newlines occur. Line breaks
    inside a paragraph are preserved.

    :param text: raw text with newline-separated lines
    :return: list of non-empty paragraph strings, each stripped of
             surrounding whitespace (an empty list for empty input)
    """
    # re.MULTILINE is required so that ^ anchors at the start of *every* line;
    # this also empties whitespace-only lines so they count as blank separators.
    text = re.sub(r'^[ \t]+|\t', '', text.strip(), flags=re.MULTILINE)
    text = re.sub(' +', ' ', text)  # Collapse runs of spaces
    stripped = (chunk.strip() for chunk in re.split(r'\n\n+', text.strip()))
    # Drop empty paragraphs so callers never have to index into an empty string
    return [para for para in stripped if para]
def _looks_like_heading(para):
    """Return True if a single paragraph passes all heading heuristics."""
    if not para:  # Empty paragraph: nothing to inspect (and para[0] would fail)
        return False
    if "\n" in para:  # Multiline can't be heading
        return False
    if re.match(r'[a-z\[]', para) is not None:  # Lower case or figure, footnote can't be heading
        return False
    if "copyright" in para.lower():
        return False
    if para.startswith(('"', "'", "‘", "“")):  # Possible direct speech line paragraph
        return False
    if re.search(r'[A-Za-z]', para) is None:  # No text in paragraph
        return False
    # Heading should not end in punctuation
    if para.endswith((")", "(", "]", "[", ".", "?", "!", ":", "-", "”", '"', "'", "’")):
        return False
    return True


def detect_headings(paragraphs):
    """Classifies paragraphs into headings or not based on line count and other heuristics.

    :param paragraphs: list of paragraph strings
    :return: list of the same length holding 1 for each paragraph judged to
             be a heading and 0 otherwise; empty paragraphs are never headings
    """
    return [1 if _looks_like_heading(para) else 0 for para in paragraphs]
# Fixed seed so the random choice of a starting heading is reproducible across runs
seed(42)

script_dir = os.path.dirname(os.path.realpath(__file__)) + os.sep
data_dir = script_dir + "data" + os.sep

GUTENBERG_MIRROR = "http://gutenberg.readingroo.ms"

_ITALIC_RE = re.compile(r'_([^_]+)_')
_FOOTNOTE_RE = re.compile(r'\[footnote( [0-9]+)?:?([^\]]+)\]', flags=re.IGNORECASE)
_FIGURE_RE = re.compile(r'\[(?:[Ii]llustration|[Pp]icture):?([^\]]*)\]', flags=re.IGNORECASE)
_XML_TAG_RE = re.compile(r'<[^<>]+>')


def _collapse_brackets(text):
    """Join the lines of a multiline [bracketed] span (e.g. a figure caption) into one line.

    Lines inside an open bracket are joined with a space so words are not
    fused; the line that closes the bracket keeps its newline so the caption
    is not glued to whatever follows it.
    """
    # NOTE(review): a "[" that is never closed keeps the bracket open to the
    # end of the text, as before - confirm whether that needs a safety limit.
    pieces = []
    open_bracket = False
    for line in text.split("\n"):
        closes = "]" in line
        if "[" in line and not closes:
            open_bracket = True
        if open_bracket and not closes:
            pieces.append(line + " ")
        else:
            pieces.append(line + "\n")
        if closes:
            open_bracket = False
    return "".join(pieces)


def _footnote_to_note(match):
    """Build a <note> element from a match of _FOOTNOTE_RE."""
    number = match.group(1)
    # group(1) includes the space before the digits; strip it from the attribute
    note_num = ' n="' + number.strip() + '"' if number is not None else ""
    return '<note' + note_num + '>' + match.group(2) + '</note>'


def _markup_paragraph(para, is_heading):
    """Wrap a paragraph in <head>/<p> and convert italics, footnotes and figures to XML."""
    if is_heading:
        para = "<head>" + para + "</head>"
    else:
        para = "<p>" + para + "</p>"
    para = para.replace("\n", " ")
    para = re.sub(' +', ' ', para)
    para = _ITALIC_RE.sub(r'<hi rend="italic">\1</hi>', para)  # italic
    para = _FOOTNOTE_RE.sub(_footnote_to_note, para)  # every footnote, not just the first
    para = _FIGURE_RE.sub(r'<figure><caption>\1</caption></figure>', para)
    return para.replace("<figure><caption></caption></figure>", "<figure/>")


def _has_letters(para):
    """Return True if the paragraph contains an ASCII letter outside of XML tags."""
    return re.search(r'[A-Za-z]', _XML_TAG_RE.sub('', para)) is not None


def _trim_alpha_less(paras):
    """Drop paragraphs without letters from the head and tail of the list."""
    first = next((k for k, para in enumerate(paras) if _has_letters(para)), len(paras))
    trimmed = paras[first:]
    while trimmed and not _has_letters(trimmed[-1]):
        trimmed.pop()
    return trimmed


# One book per line: ID, title and author separated by tabs
with io.open("gutenberg_meta_filtered.tab", encoding="utf8") as meta_file:
    meta = meta_file.read().strip().split("\n")

docnum = 0
total_spaces = 0
accepted = 0

for i, book in enumerate(meta):
    # Parentheses matter: "i+1 % 10" parses as "i + (1 % 10)" and never equals 0
    if (i + 1) % 10 == 0:
        sys.stderr.write("\rSeen " + str(i+1) + " books, accepted " + str(accepted) +
                         " with " + str(total_spaces)+ " spaces           ")
    book_id, title, author = book.split("\t")

    try:
        e_text = load_etext(int(book_id), mirror=GUTENBERG_MIRROR)
        url = _format_download_uri(int(book_id), mirror=GUTENBERG_MIRROR)
    except Exception:  # can't find URI, e.g. UnknownDownloadUriException; Ctrl-C still propagates
        sys.stderr.write("WARN: could not download text ID:" + str(book_id) + "\n")
        continue

    text = strip_headers(e_text).strip().replace("\r", "")
    # Collapse multiline brackets (e.g. multiline figure captions)
    text = _collapse_brackets(text)

    paragraphs = get_paragraphs(text)
    heading_idx = detect_headings(paragraphs)
    hds = [idx for idx, h in enumerate(heading_idx) if h == 1]
    heading_positions = set(hds)

    # Candidate start points: headings followed by a somewhat long paragraph
    hd_before_text = []
    # NOTE(review): `last` is never updated, so the adjacency test below is
    # always true; kept as-is to preserve which books/excerpts are selected.
    last = 1000000
    for idx in reversed(hds):
        if idx != last - 1 and idx + 1 < len(paragraphs) - 1:
            if len(paragraphs[idx + 1]) > 200:  # Next paragraph must be somewhat long
                hd_before_text.append(idx)

    if not hd_before_text:
        continue

    start = choice(hd_before_text)

    # Collect paragraphs from the chosen heading until the target size is exceeded
    spaces = 0
    output = []
    for par_num, para in enumerate(paragraphs[start:], start):
        spaces += para.count(" ")
        output.append(_markup_paragraph(para, par_num in heading_positions))
        if spaces > TARGET_SPACES:
            break

    if spaces < MIN_SPACES:
        continue

    # Remove alpha-less paragraphs in head/tail, ignoring XML:
    output = _trim_alpha_less(output)
    if not output:  # Nothing but letter-less paragraphs: no document to write
        continue

    doc = Document()
    doc.url = url
    doc.genre = "fiction"
    doc.author, doc.title = author, title
    doc.text = "\n\n".join(output)
    doc.docnum = docnum

    if doc.text.count(" ") > MAX_SPACES:
        continue
    if detect_hyphenation(doc.text):
        continue
    if detect_non_fiction(doc.text):
        continue

    doc.serialize()
    accepted += 1
    total_spaces += doc.text.count(" ")
    if total_spaces > TOTAL_GENRE_SIZE:
        break

    docnum += 1