-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcut_tree.py
executable file
·166 lines (137 loc) · 6.04 KB
/
cut_tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/usr/bin/env python
import aiohttp
import logging
import re
ARXIV_ABS_URL_BASE = "https://arxiv.org/abs/"
ARXIV_PDF_URL_BASE = "https://arxiv.org/pdf/"
# poor man's XML parser; group the ID and text title separately
FIND_TITLE_REGEX = re.compile("<title>\[(\d*\.\d*)\](.*)</title>")
FIND_ID_REGEX = re.compile("\d{4}\.\d{5,6}") #lol, maybe one day we'd publish > 99999 articles per month
LOGGER_FORMAT = '%(asctime)s %(message)s'
logging.basicConfig(format=LOGGER_FORMAT, datefmt='[%H:%M:%S]')
log = logging.getLogger()
log.setLevel(logging.INFO)
async def get_pdf_file(session, aid):
url = ARXIV_PDF_URL_BASE + aid
async with session.get(url) as resp:
log.debug("requesting PDF from {}".format(url))
resp.raise_for_status()
return await resp.read()
async def get_pdf_name(session, aid):
url = ARXIV_ABS_URL_BASE + aid
async with session.get(url) as resp:
log.debug("requesting abstract from {}".format(url))
resp.raise_for_status()
xml = await resp.text()
match = re.search(FIND_TITLE_REGEX, xml)
return str(match.group(1)), str(match.group(2).strip())
def get_article_id(line):
match = re.search(FIND_ID_REGEX, line)
if match:
return match.group(0)
else:
raise ValueError("invalid input {}".format(line))
async def process_one_article(session, article_uri):
aid = get_article_id(article_uri)
log.info("processing article {}".format(aid))
parsed_aid, text_title = await get_pdf_name(session, aid)
assert aid == parsed_aid, "id {} obtained from input does not match id {} from server".format(aid, parsed_aid)
pdf_file = await get_pdf_file(session, aid)
return pdf_file, text_title
async def main(article_list, postproc_fn=None):
async with aiohttp.ClientSession() as session:
for (aidx, task_result) in enumerate(asyncio.as_completed([process_one_article(session, a) for a in article_list])):
pdf_file, title = await task_result
log.info("Got file for article {}: {}".format(article_list[aidx], title))
if postproc_fn:
await postproc_fn(title, pdf_file)
if __name__ == "__main__":
import argparse
import asyncio
import os
parser = argparse.ArgumentParser(description=
"Download articles from arXiv given the article id, optionally print and/or save the PDF")
parser.add_argument("--save", type=str,default=None, help="location on disk to save the downloaded PDF;\
if unspecified, program does not write to disk")
parser.add_argument("--print", action='store_true', help="print with the default CUPS printer;\
does not print if unspecified")
parser.add_argument("--input", type=str, default=None, help="input specifying what article(s) to download.\
Can be either a comma separated strings containing the article id(s) like YYMM.abcde,YYMM.xxxxx\
or a file whose lines contain the article id")
args = parser.parse_args()
print_options = {}
temp_dir = None
def generate_article_list(input_str):
if not input_str:
raise ValueError("must supply input")
ret = [token.strip() for token in input_str.split(',')]
if (len(ret) == 0):
raise ValueError("--input invalid; got {}".format(ret))
elif (len(ret) == 1 and os.path.isfile(ret[0])):
with open(ret,'r') as f:
ret = [line.rstrip('\n') for line in f]
return ret
def setup_printing(should_print):
if not should_print:
log.info("not printing")
return None
ret = None
try:
import cups
conn = cups.Connection()
printers = conn.getPrinters()
if not printers:
raise RuntimeError("you don't have any printers setup with cups")
name = conn.getDefault()
loc = printers[name].get("printer-location", "unknown")
logging.info("using printer {} at location {}".format(name, loc))
def print_one_file(filepath, job_title, print_opt):
job_id = conn.printFile(name, filepath, job_title, print_opt)
log.info("submitting job {} ({}) to printer".format(job_id, job_title))
ret = print_one_file
except ImportError as _:
log.info("could not import pycups; make sure you have it installed with `pip install pycups")
except Exception as e:
print(e)
log.info("not printing")
finally:
return ret
def setup_saving(save_dir, print_fn):
if not save_dir and not print_fn:
log.info("not saving")
return None
import aiofiles, os
if save_dir:
# setup saving
if not os.path.exists(save_dir):
os.mkdir(save_dir)
elif print_fn:
# not saving but printing, so need tempfile
import tempfile
save_dir = tempfile.mkdtemp()
global temp_dir
temp_dir = save_dir
log.info("saving to {}".format(save_dir))
async def save_file_fn(filename, bin_content):
final_path = os.path.join(save_dir, filename)
log.info("saving {}".format(final_path))
async with aiofiles.open(final_path, 'wb') as pdf_file:
await pdf_file.write(bin_content)
return final_path
return save_file_fn
try:
print_fn = setup_printing(args.print)
save_fn = setup_saving(args.save, print_fn)
async def postprocess_fn(title, pdf_content):
if save_fn:
filepath = await save_fn(title + ".pdf", pdf_content)
if print_fn:
print_fn(filepath, title, print_options)
articles = generate_article_list(args.input)
log.info("given {} as input".format(",".join(articles)))
asyncio.run(main(articles, postprocess_fn))
finally:
if temp_dir:
log.info("deleting temporary directory")
import shutil
shutil.rmtree(temp_dir)