-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
36 lines (32 loc) · 1.19 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import sys, getopt
from dataCollector.googleScholar import GoogleScholar
# This file is the command-line entry point (the controls) for the scraper.
def main(argv):
    """Parse command-line options and run a Google Scholar scrape.

    Recognised options:
        -h, --help          print the usage line and exit
        -i, --input FILE    path handed to ``GoogleScholar.search_from_file``
        -o, --output FILE   path handed to ``list_citations.to_csv``
        -n, --number N      integer passed as ``num_threads`` (default 1)
        -t, --tor           construct the scraper with ``is_thor_enabled=True``

    Args:
        argv: Argument list without the program name, e.g. ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 2 when an option is unknown or malformed,
            when ``--number`` is not an integer, or when ``--input`` or
            ``--output`` is missing; with the default status after the
            usage line has been printed for ``-h``/``--help``.
    """
    usage = "use --input, --output, --tor and --number"
    input_file = ''
    output_file = ''
    threads_number = 1
    tor_enabled = False

    try:
        opts, _ = getopt.getopt(
            argv, "hti:o:n:", ["help", "input=", "tor", "output=", "number="]
        )
    except getopt.GetoptError as err:
        # Tell the user what was wrong instead of exiting silently.
        print(err, file=sys.stderr)
        print(usage, file=sys.stderr)
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt in ("-i", "--input"):
            input_file = arg
        elif opt in ("-o", "--output"):
            output_file = arg
        elif opt in ("-n", "--number"):
            try:
                threads_number = int(arg)
            except ValueError:
                # Report a usage error rather than an uncaught traceback.
                print("--number expects an integer, got %r" % arg,
                      file=sys.stderr)
                sys.exit(2)
        elif opt in ("-t", "--tor"):
            tor_enabled = True

    # Fail before any scraping starts: the output path is only used after
    # the search has finished, so a missing one would waste the whole run.
    missing = [
        name
        for name, value in (("--input", input_file), ("--output", output_file))
        if not value
    ]
    if missing:
        print("missing required option(s): " + ", ".join(missing),
              file=sys.stderr)
        print(usage, file=sys.stderr)
        sys.exit(2)

    # NOTE: the keyword really is spelled ``is_thor_enabled`` at this call
    # site; it must match the GoogleScholar constructor.
    google = GoogleScholar(is_thor_enabled=tor_enabled)
    google.search_from_file(input_file=input_file, num_threads=threads_number)
    # list_citations is presumably a pandas DataFrame (it exposes to_csv) --
    # confirm in dataCollector.googleScholar. Written without index or header.
    google.list_citations.to_csv(output_file, index=False, header=False)
# Script entry point. Per the original author's note, the UI also loads this
# file as a module (to avoid duplicating code), so the command-line handling
# below only runs when the file is executed directly.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    if cli_args:
        main(cli_args)
    else:
        print("no input")