import os
import re
from collections import Counter
from string import digits

import requests
from bs4 import BeautifulSoup

from utils import (scrape_table, get_arxiv_id_v2, get_abstract_from_arxiv_url,
                   download_arxiv_paper, sort_dict, save_dict2json)
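
# This script scrapes accepted-paper lists for ICCV 2021/2023 and CVPR 2021-2023,
# resolves each paper on arXiv or openaccess.thecvf.com, optionally downloads the
# PDFs, and, when a local en->zh model is present, saves Chinese translations of
# titles and abstracts alongside the English text.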


def translate_en2zh(model, tokenizer, text):
    """Translate English text to Chinese with the given seq2seq model, or return None."""
    if model and tokenizer:
        try:
            input_ids = tokenizer.encode(text, return_tensors="pt")
            outputs = model.generate(input_ids=input_ids, max_length=1024)
            chinese_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            print(f'中文:{chinese_text}')
            return chinese_text
        except Exception as e:
            print(f"Translation failed: {e}")
    return None


def process_table(dataframes, args, model, tokenizer, download_dir):
    paper_num = 0
    class_type_dict = dict()
    title_key_word_c = Counter()
    # Walk every table row, resolve the paper on arXiv, and log the results.
    for i, df in enumerate(dataframes):
        paper_num += df.shape[0] - 1  # the first row is the header
        for idx in range(1, df.shape[0]):
            print(f'Table: ({i}/{len(dataframes)}). row: ({idx}/{df.shape[0]}) \n')
            class_type = df[0][idx].replace(":", ": ").replace("/", ",")
            paper_title = df[1][idx].replace(":", ": ")
            class_type = class_type.translate(str.maketrans('', '', digits))  # remove digits
            if class_type in class_type_dict:
                class_type_dict[class_type] += 1
            else:
                class_type_dict[class_type] = 1
            # Split on ASCII/fullwidth colons, spaces, and commas for the title vocabulary.
            title_key_word_c.update(re.split(':|：| |,', paper_title))
            print(f'class_type: {class_type}. \npaper name: {paper_title}')
            # Query arXiv for the paper ID and abstract.
            arxiv_id, abstract = get_arxiv_id_v2(paper_title)
            with open(f'{download_dir}{class_type.replace(": ", "_")}.txt', 'a', encoding='utf-8') as arxiv_file:
                arxiv_file.write(f'\ntitle: {paper_title}\n')
                chinese_title = translate_en2zh(model, tokenizer, paper_title)
                if chinese_title is not None:
                    arxiv_file.write(f'中文名:{chinese_title}\n')
                if arxiv_id is None:
                    # Record titles that could not be matched on arXiv.
                    with open(f'{download_dir}find_err.txt', 'a', encoding='utf-8') as find_err_file:
                        find_err_file.write(f'paper_title:{paper_title}\n')
                else:
                    abs_url = f'https://arxiv.org/abs/{arxiv_id}'
                    pdf_url = f'https://arxiv.org/pdf/{arxiv_id}.pdf'
                    print(f'link:{abs_url}')
                    print(f'link:{pdf_url}')
                    arxiv_file.write(f'link:{abs_url}\n')
                    arxiv_file.write(f'link:{pdf_url}\n')
                    if abstract is None:
                        # Fall back to scraping the abstract from the arXiv page.
                        abstract = get_abstract_from_arxiv_url(abs_url)
                    if args.download_paper:
                        if args.download_paper_list is None:  # if None, download everything
                            download_arxiv_paper(pdf_url, download_dir, class_type, paper_title)
                        else:
                            lower_name = paper_title.replace(':', '_').replace(': ', '_').lower()
                            lower_type = class_type.replace(':', '_').replace(': ', '_').lower()
                            for keyword in args.download_paper_list:
                                if keyword in lower_type or keyword in lower_name:
                                    download_arxiv_paper(pdf_url, download_dir, class_type, paper_title)
                                    break
                if abstract:
                    abstract = abstract.split('\n')[0]
                    print(f'abstract:{abstract}')
                    arxiv_file.write(f'abstract:{abstract}\n')
                    # Translate the abstract into Chinese.
                    chinese_abstract = translate_en2zh(model, tokenizer, abstract)
                    if chinese_abstract is not None:
                        arxiv_file.write(f'摘要:{chinese_abstract}\n')
            print("\n")
    save_dict2json(sort_dict(class_type_dict), f'{download_dir}class_type.json')
    save_dict2json(sort_dict(dict(title_key_word_c)), f'{download_dir}paper_title_vocabulary.json')
    return paper_num


def download_iccv2023(args, model, tokenizer):
    download_dir = './ICCV2023/'
    os.makedirs(download_dir, exist_ok=True)
    # Replace the URL below with your own if needed.
    url = "https://iccv2023.thecvf.com/main.conference.program-107.php"
    # Scrape the tables from the page, then process them.
    dataframes = scrape_table(url)
    paper_num = process_table(dataframes, args, model, tokenizer, download_dir)
    print(f'iccv2023 paper total num: {paper_num}')
    return paper_num
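

# The openaccess.thecvf.com listing pages share one layout: each paper appears as
# a <dt class="ptitle"> entry linking to a detail page that carries pdf/supp/arXiv
# links, the author list, and the abstract. The helper below parses that layout.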
def download_from_openaccess_url(url, args, model, tokenizer, download_dir):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all('dt', class_='ptitle')
    openaccess_url = "https://openaccess.thecvf.com/"
    for idx, result in enumerate(results):  # one entry per paper title
        print(f'\npaper idx:({idx}/{len(results)})')
        temp = result.find_all('a')[0]
        paper_title = temp.text.strip().replace(':', '_')
        paper_link = openaccess_url + temp['href']
        # Split the output across files of 500 papers each.
        with open(f'{download_dir}paper_list_{idx // 500}.txt', 'a', encoding='utf-8') as arxiv_file:
            print(f'title:{paper_title}')
            arxiv_file.write(f'\ntitle: {paper_title}\n')
            chinese_title = translate_en2zh(model, tokenizer, paper_title)
            if chinese_title is not None:
                arxiv_file.write(f'中文名:{chinese_title}\n')
            paper_response = requests.get(paper_link)
            paper_soup = BeautifulSoup(paper_response.text, 'html.parser')
            paper_content = paper_soup.find_all('div', id='content')[0]
            print(f'link:{paper_link}')
            arxiv_file.write(f'link:{paper_link}\n')
            paper_links = paper_content.find_all('dd')[1].find_all('a')
            pdf_url = None
            supplemental_url = None
            arxiv_url = None
            for link in paper_links:
                if link.text.strip() == 'pdf':
                    pdf_url = openaccess_url + link['href']
                if link.text.strip() == 'supp':
                    supplemental_url = openaccess_url + link['href']
                if link.text.strip() == 'arXiv':
                    arxiv_url = link['href']
            if pdf_url is not None:
                arxiv_file.write(f'openaccess pdf:{pdf_url}\n')
            elif arxiv_url is not None:
                # Fall back to the arXiv PDF when open access hosts no copy.
                pdf_url = arxiv_url.replace('abs', 'pdf') + '.pdf'
                arxiv_file.write(f'arxiv pdf:{pdf_url}\n')
            # if supplemental_url is not None:
            #     arxiv_file.write(f'supplemental:{supplemental_url}\n')
            # if arxiv_url is not None:
            #     arxiv_file.write(f'arxiv:{arxiv_url}\n')
            paper_authors = paper_content.find_all('div', id='authors')[0].text.strip()
            print(f'author:{paper_authors}')
            arxiv_file.write(f'author:{paper_authors}\n')
            if args.download_paper and pdf_url is not None:
                if args.download_paper_list is None:  # if None, download everything
                    download_arxiv_paper(pdf_url, download_dir, None, paper_title)
                else:
                    lower_name = paper_title.replace(':', '_').replace(': ', '_').lower()
                    for keyword in args.download_paper_list:
                        if keyword in lower_name:
                            download_arxiv_paper(pdf_url, download_dir, None, paper_title)
                            break
            paper_abstract = paper_content.find_all('div', id='abstract')[0].text.strip()
            print(f'abstract:{paper_abstract}')
            arxiv_file.write(f'abstract:{paper_abstract}\n')
            chinese_abstract = translate_en2zh(model, tokenizer, paper_abstract)
            if chinese_abstract is not None:
                arxiv_file.write(f'摘要:{chinese_abstract}\n')
    return len(results)


def download_iccv2021(args, model, tokenizer):
    download_dir = './ICCV2021/'
    os.makedirs(download_dir, exist_ok=True)
    url = "https://openaccess.thecvf.com/ICCV2021?day=all"
    paper_num = download_from_openaccess_url(url, args, model, tokenizer, download_dir)
    print(f'iccv2021 paper total num: {paper_num}')
    return paper_num


def download_cvpr2023(args, model, tokenizer):
    download_dir = './CVPR2023/'
    os.makedirs(download_dir, exist_ok=True)
    url = "https://openaccess.thecvf.com/CVPR2023?day=all"
    paper_num = download_from_openaccess_url(url, args, model, tokenizer, download_dir)
    print(f'cvpr2023 paper total num: {paper_num}')
    return paper_num


def download_cvpr2022(args, model, tokenizer):
    download_dir = './CVPR2022/'
    os.makedirs(download_dir, exist_ok=True)
    url = "https://openaccess.thecvf.com/CVPR2022?day=all"
    paper_num = download_from_openaccess_url(url, args, model, tokenizer, download_dir)
    print(f'cvpr2022 paper total num: {paper_num}')
    return paper_num


def download_cvpr2021(args, model, tokenizer):
    download_dir = './CVPR2021/'
    os.makedirs(download_dir, exist_ok=True)
    url = "https://openaccess.thecvf.com/CVPR2021?day=all"
    paper_num = download_from_openaccess_url(url, args, model, tokenizer, download_dir)
    print(f'cvpr2021 paper total num: {paper_num}')
    return paper_num


if __name__ == "__main__":
    from argparse import ArgumentParser

    # Note: argparse's type=bool treats every non-empty string as True, so the
    # boolean flags are parsed explicitly instead.
    def str2bool(s):
        return str(s).lower() in ('true', '1', 'yes')

    parser = ArgumentParser()
    parser.add_argument("--download_paper", type=str2bool, default=True,
                        help="download papers")
    parser.add_argument("--download_paper_list", type=str, nargs='*',
                        default=['single image', 'single-image', 'novel view', 'novel-view'],
                        help="Only download papers whose category or title contains one of "
                             "these keywords. If None, download all.")
    parser.add_argument("--trans_abstract", type=str2bool, default=True,
                        help="translate titles and abstracts into Chinese")
    args = parser.parse_args()
    # Load the local en->zh translation model if present (e.g. a downloaded copy
    # of Helsinki-NLP/opus-mt-en-zh).
    model_path = './opus-mt-en-zh/'
    if args.trans_abstract and os.path.exists(model_path):
        from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    else:
        model = None
        tokenizer = None
    paper_num = download_iccv2023(args, model, tokenizer)
    paper_num += download_iccv2021(args, model, tokenizer)
    paper_num += download_cvpr2023(args, model, tokenizer)
    paper_num += download_cvpr2022(args, model, tokenizer)
    paper_num += download_cvpr2021(args, model, tokenizer)
    print(f'paper total num: {paper_num}')
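

# Hypothetical invocation (the keywords and flag values are examples only):
#   python main.py --download_paper true --download_paper_list "single image" "novel view"
# With --trans_abstract true and a local ./opus-mt-en-zh/ checkpoint, the output
# .txt files also contain Chinese translations of each title and abstract.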