-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathscraper.py
106 lines (94 loc) · 3.56 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
"""
An application to scrape amazon.jobs website.
Passing an URL, it will return all the job IDs and job titles found in that URL.
Then, the app will create the job URL for each job found and, finally, will
save all the info in a CSV file.
"""
import re
import sys
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from config import URL
from config import WEBDRIVER_PATH_MAC
from config import WEBDRIVER_PATH_WIN
class Scraper:
"""
This is the main class of the application
"""
@staticmethod
def create_browser(driver_path: str) -> webdriver.Chrome:
"""
Creating a selenium object to simulate a browser.
The headless tag creates an invisible browser.
"""
browser_options = Options()
browser_options.add_argument('--headless')
browser_options.add_argument('--no-sandbox')
browser = webdriver.Chrome(executable_path=driver_path,
options=browser_options)
print('Done Creating Browser')
return browser
@staticmethod
def get_job_and_location(soup: BeautifulSoup) -> list:
"""
Getting job name, ID and location.
Then, creating the URL of each job ID.
"""
positions_list = []
info_list = []
for info in soup.findAll('div', {'class': 'info first col-12 col-md-8'}):
positions_list.append(info)
# print(positions_list) # Debug
for elem in positions_list:
# Getting job title from the HTML
regex = (r'job-title">([^<]+)')
# Getting location from the HTML
regex2 = (r'location-and-id">([^|]+)')
# Getting job ID from the HTML
regex3 = (r'Job ID: ([^<]+)')
job_title = re.findall(regex, str(elem))
location = re.findall(regex2, str(elem))
job_id = re.findall(regex3, str(elem))
# Creating a list of dictionaries with job_title, location, ID and url
info_dict = {
'job_title': job_title[0],
'location': location[0],
'job_id': job_id[0],
'job_url': 'https://www.amazon.jobs/en/jobs/' + str(job_id[0])
}
# Appending every dictionary created to the list
info_list.append(info_dict)
# Creating the csv file
Scraper.result_to_csv(info_list)
# print(info_list) # Debug
return info_list
@staticmethod
def result_to_csv(lst: list) -> str:
"""
Saving the output of the scraper into a CSV file.
"""
keys = lst[0].keys()
with open('example_output.csv', 'w') as output_file:
dict_writer = csv.DictWriter(output_file, keys)
dict_writer.writeheader()
dict_writer.writerows(lst)
return 'Csv created!'
@staticmethod
def get_chrome_drivers() -> str:
if sys.platform.startswith('linux'):
driver_path = WEBDRIVER_PATH_MAC
elif sys.platform.startswith('win'):
driver_path = WEBDRIVER_PATH_WIN
else:
driver_path = "OS not supported."
return driver_path
if __name__ == '__main__':
print('Scraping started')
BROWSER = Scraper.create_browser(driver_path=Scraper.get_chrome_drivers())
BROWSER.get(url=URL)
PAGE_HTML = BROWSER.page_source
SOUP = BeautifulSoup(PAGE_HTML, features="html.parser")
Scraper.get_job_and_location(SOUP)
print('Your amazon.jobs url has been scraped. Check your output CSV file.')