gscrawler.py
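"""Crawl Google Scholar search results with the scholarly library and write each
article's index, year, authors, title, and abstract to a CSV file, rotating
through free HTTP proxies whenever a request fails."""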
import csv
import re
import time
import requests
from fp.errors import FreeProxyException
from fp.fp import FreeProxy
from scholarly import ProxyGenerator
from scholarly import scholarly as S
class MyFreeProxy(FreeProxy):
    """FreeProxy subclass that pulls its candidate proxies from proxylistplus.com."""

    def __init__(self, country_id=None, timeout=0.5, rand=False, anonym=False,
                 elite=False, google=None, https=False):
        self.country_id = country_id
        self.timeout = timeout
        self.random = rand
        self.anonym = anonym
        self.elite = elite
        self.google = google
        self.schema = 'https' if https else 'http'

    def get_proxy_list(self, repeat):
        try:
            page = requests.get(self.__website(repeat))
        except requests.exceptions.RequestException as e:
            raise FreeProxyException(
                f'Request to {self.__website(repeat)} failed') from e
        try:
            # Scrape "ip:port" pairs from the HTML table on the proxy list page.
            proxies = re.findall(
                r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>',
                page.text)
            return [':'.join(proxy) for proxy in proxies]
        except Exception as e:
            raise FreeProxyException('Failed to get list of proxies') from e

    def __website(self, repeat):
        return "https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1"
# Enable scholarly's logger so proxy and request activity is visible.
S.set_logger(True)


def _set_new_proxy():
    """Fetch free proxies until one works, then route scholarly through it."""
    pg = ProxyGenerator()
    while True:
        proxy = MyFreeProxy(rand=True, timeout=1).get()
        success = pg.SingleProxy(http=proxy, https=proxy)
        if success:
            break
    # Tell scholarly to use the configured proxy for subsequent requests.
    S.use_proxy(pg)
    print("Working proxy:", proxy)
    return proxy
def crawl_abstracts(keyword, year_low=None, year_high=None, outfile=None, max_iter=1000):
    _set_new_proxy()
    # Retry the initial query with fresh proxies until Google Scholar answers.
    while True:
        try:
            search_query = S.search_pubs(keyword, year_low=year_low, year_high=year_high)
            time.sleep(5)
            break
        except Exception as e:
            print(e)
            print("Trying new proxy on query base")
            _set_new_proxy()
    print(f"Crawling Started with keyword <{keyword}>.\n")
    if not outfile:
        outfile = "[Crawling Results]" + keyword + ".csv"
    with open(outfile, 'w', newline='', encoding='utf-8') as o_file:
        w = csv.writer(o_file)
        w.writerow(["Index", "Year", "Author", "Title", "Abstract"])
        idx = 0
        for _ in range(max_iter):
            article = None
            while True:
                try:
                    time.sleep(5)
                    article = next(search_query)['bib']
                    break
                except StopIteration:
                    # No more results for this query.
                    break
                except Exception as e:
                    print(e)
                    print("Trying new proxy on article read")
                    _set_new_proxy()
            if article is None:
                break
            try:
                title = article["title"]
            except KeyError:
                print("Error on Title Info")
                continue
            try:
                abstract = article["abstract"]
                if abstract.startswith("Skip to main content"):
                    continue
            except KeyError:
                print("Error on Abstract")
                continue
            try:
                year = article["pub_year"]
            except KeyError:
                print("Error on year")
                continue
            try:
                author = article["author"]
            except KeyError:
                print("Error on author")
                continue
            idx += 1
            citation_form = f'{",".join(author)}. "{title}." {year}'
            print(f"\n{idx}. {citation_form}")
            w.writerow([str(idx), year, ','.join(author), title, abstract])
    print("\n\nProcess Done!")
    print(f"{idx} articles are crawled.")
    print(f"Results are saved in <{outfile}>.")
if __name__ == "__main__":
keywords = "source:management source:Communication source:Quarterly" # The keywords for searching. This is an example for searching papers on the journal "Management Communication Quarterly".
year_low = 2012
year_high = None
outfile = None
max_iter = 1000
crawl_abstracts(
keywords,
year_low=year_low,
year_high=year_high,
outfile=outfile,
max_iter=max_iter
)
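For reference, scholarly also ships its own free-proxy generator. Below is a minimal sketch of wiring it up in place of the MyFreeProxy / _set_new_proxy pair above, reusing the example keyword from the script; free-proxy availability is not guaranteed, so retries may still be needed:

from scholarly import ProxyGenerator, scholarly

pg = ProxyGenerator()
if pg.FreeProxies():  # scholarly's bundled free-proxy source
    scholarly.use_proxy(pg)
    results = scholarly.search_pubs(
        "source:management source:Communication source:Quarterly", year_low=2012)
    print(next(results)['bib']['title'])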