Commit
Merge pull request #67 from noameron/feature/nvidia_workday_scraper
feature/workday_scraper
nicobrenner authored Nov 18, 2024
2 parents 6d37cab + 5247ec1 commit 3f54f05
Showing 18 changed files with 258 additions and 43 deletions.
4 changes: 3 additions & 1 deletion README.md
@@ -44,7 +44,9 @@ Note: If you want to add another source of job listings, [go to this issue](http


## Updates

* Added a new scraper for Workday, currently scraping NVIDIA, CrowdStrike, Red Hat and Salesforce.
* The scraper currently covers all countries and only picks up posts no more than a **week** old!

* Building in public:
* ❤️ If you want to contribute to this project and want to take a crack at writing tests for it, it would be amazing! 🤗 Here's a ticket to write a new test, and a walk-through of the current test code: [Request to create: Test displaying the resume text](https://github.com/nicobrenner/commandjobs/issues/48) 🙏🏼

4 changes: 3 additions & 1 deletion config/requirements.txt
@@ -2,4 +2,6 @@ beautifulsoup4==4.9.3
requests==2.25.1
openai
python-dotenv
windows-curses; sys_platform == 'win32'
windows-curses; sys_platform == 'win32'
selenium==4.25.0
webdriver-manager==4.0.2
5 changes: 5 additions & 0 deletions docker/Dockerfile
@@ -10,6 +10,11 @@ WORKDIR /commandjobs
# Install any needed packages specified in requirements.txt
RUN pip3 install --no-cache-dir -r config/requirements.txt

# Install required packages, including Chromium and ChromeDriver
RUN apt-get update && \
apt-get install -y wget unzip chromium chromium-driver && \
apt-get clean

# Run menu.py when the container launches
CMD ["python3", "src/menu.py"]
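
Since the image now ships Chromium and ChromeDriver, a container-friendly alternative to downloading a driver at runtime is to point Selenium at the system binaries. This is only a minimal sketch, not part of this commit; the /usr/bin/chromium and /usr/bin/chromedriver paths are assumptions based on the Debian packages installed above:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

options = Options()
options.binary_location = "/usr/bin/chromium"  # assumed path for the Debian chromium package
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# assumed path for the Debian chromium-driver package
driver = webdriver.Chrome(service=Service("/usr/bin/chromedriver"), options=options)
driver.get("https://example.com")
print(driver.title)
driver.quit()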

6 changes: 5 additions & 1 deletion docker/docker-compose.yml
@@ -8,20 +8,24 @@ services:
build:
context: ..
dockerfile: docker/Dockerfile

# Set container name
container_name: commandjobs

# Set environment variables
environment:
- MENU_APP=src/menu.py
- PYTHONPATH=/commandjobs

# Mount entire project into docker container under /repo
volumes:
- ../:/commandjobs
- ./docker/docker-entrypoint.sh:/commandjobs/docker-entrypoint.sh

# Use host network mode (may require changes depending on Docker environment)
network_mode: host

tty: true # Allocate a pseudo-TTY
stdin_open: true # Keep STDIN open

working_dir: /commandjobs
Empty file added job_scraper/__init__.py
Empty file.
Empty file.
2 changes: 0 additions & 2 deletions src/hn_scraper.py → job_scraper/hacker_news/scraper.py
@@ -1,8 +1,6 @@
import requests
from bs4 import BeautifulSoup
import sqlite3
import os
from queue import Queue

# Define a new exception for interrupting scraping
class ScrapingInterrupt(Exception):
Empty file.
10 changes: 10 additions & 0 deletions job_scraper/scraper_selectors/workday_selectors.py
@@ -0,0 +1,10 @@
from enum import StrEnum


class WorkDaySelectors(StrEnum):
JOB_LISTING_XPATH = '//li[@class="css-1q2dra3"]'
JOB_TITLE_XPATH = './/h3/a'
JOB_ID_XPATH = './/ul[@data-automation-id="subtitle"]/li'
POSTED_ON_XPATH = './/dd[@class="css-129m7dg"][preceding-sibling::dt[contains(text(),"posted on")]]'
JOB_DESCRIPTION_XPATH = '//div[@data-automation-id="jobPostingDescription"]'
NEXT_PAGE_XPATH = "//button[@data-uxi-element-id='next']"
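
Because StrEnum members behave as plain strings, they can be passed directly to Selenium's locator API. A minimal sketch of how these selectors are consumed (assuming driver is an already-configured WebDriver instance, as set up in the scraper further down):

from selenium.webdriver.common.by import By

from job_scraper.scraper_selectors.workday_selectors import WorkDaySelectors

# driver is assumed to be an already-configured selenium WebDriver instance
for job_element in driver.find_elements(By.XPATH, WorkDaySelectors.JOB_LISTING_XPATH):
    title = job_element.find_element(By.XPATH, WorkDaySelectors.JOB_TITLE_XPATH).text
    job_id = job_element.find_element(By.XPATH, WorkDaySelectors.JOB_ID_XPATH).text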
13 changes: 13 additions & 0 deletions job_scraper/utils.py
@@ -0,0 +1,13 @@
def get_workday_company_urls() -> dict:
urls = {
'NVIDIA': 'https://nvidia.wd5.myworkdayjobs.com/NVIDIAExternalCareerSite?jobFamilyGroup=0c40f6bd1d8f10ae43ffaefd46dc7e78',
'SALESFORCE': 'https://salesforce.wd12.myworkdayjobs.com/en-US/External_Career_Site/details/Lead-Marketing-Cloud-Solution-Engineer_JR268932?jobFamilyGroup=14fa3452ec7c1011f90d0002a2100000',
'RED_HAT': 'https://redhat.wd5.myworkdayjobs.com/Jobs',
'CROWDSTRIKE': 'https://crowdstrike.wd5.myworkdayjobs.com/crowdstrikecareers'
}
return urls

def get_workday_post_time_range() -> list[str]:
return ['posted today', 'posted yesterday', 'posted 2 days ago', 'posted 3 days ago',
'posted 4 days ago', 'posted 5 days ago', 'posted 6 days ago', 'posted 7 days ago']
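
A quick sketch of how these two helpers are meant to combine; the date label here is a hypothetical example, and the scraper below performs the same membership check:

from job_scraper.utils import get_workday_company_urls, get_workday_post_time_range

urls = get_workday_company_urls()             # company name -> Workday careers URL
week_window = get_workday_post_time_range()   # lowercase "posted ..." labels within the last week

posted_on = "Posted 3 Days Ago"               # hypothetical label read from a listing
is_recent = posted_on.lower() in week_window  # True, so this listing would be collected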

Empty file added job_scraper/waas/__init__.py
Empty file.
File renamed without changes.
Empty file added job_scraper/workday/__init__.py
Empty file.
129 changes: 129 additions & 0 deletions job_scraper/workday/scraper.py
@@ -0,0 +1,129 @@
import sqlite3
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from job_scraper.scraper_selectors.workday_selectors import WorkDaySelectors
from job_scraper.utils import get_workday_post_time_range, get_workday_company_urls


class WorkdayScraper:
def __init__(self, db_path='job_listings.db', update_func=None, done_event=None, result_queue=None):
self.db_path = db_path
self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=self.get_selenium_configs())
self.one_week_span_text = get_workday_post_time_range()
self.company_urls = get_workday_company_urls()
self.new_entries_count = 0
self.done_event = done_event
self.result_queue = result_queue
self.update_func = update_func
self.job_listings = []

@staticmethod
def get_selenium_configs() -> Options:
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
return chrome_options

def save_to_database(self, original_text, original_html, source, external_id):
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
c.execute("INSERT OR IGNORE INTO job_listings (original_text, original_html, source, external_id) VALUES (?, ?, ?, ?)",
(original_text, original_html, source, external_id))
conn.commit()
conn.close()
return c.rowcount > 0

def save_new_job_listing(self, job_description, job_description_html, job_url, job_id):
if not job_description:
return
if not job_description_html:
return
if not job_url:
return
if not job_id:
return
self.job_listings.append({
'original_text': job_description,
'original_html': job_description_html,
'source': job_url,
'external_id': job_id
})

def save_job_listings_to_db(self):
for job in self.job_listings:
inserted = self.save_to_database(
job['original_text'],
job['original_html'],
job['source'],
job['external_id']
)
if inserted:
self.new_entries_count += 1
if self.done_event:
self.result_queue.put(self.new_entries_count)
self.done_event.set()

def scrape(self):
self.update_func(f"Scraping Workday companies:\t{', '.join(self.company_urls.keys())}")

for company_name, company_url in self.company_urls.items():
self.driver.get(company_url)
wait = WebDriverWait(self.driver, 10)

posted_this_week = True
while posted_this_week:
try:
wait.until(EC.presence_of_element_located((By.XPATH, WorkDaySelectors.JOB_LISTING_XPATH)))
except TimeoutException:
self.update_func("Job Listing Element not found. Try again later")
break

job_elements = self.driver.find_elements(By.XPATH, WorkDaySelectors.JOB_LISTING_XPATH)
for job_element in job_elements:
try:
self.update_func(f"Scraping {company_name}: {self.driver.current_url}")
job_title_element = job_element.find_element(By.XPATH, WorkDaySelectors.JOB_TITLE_XPATH)
job_id_element = job_element.find_element(By.XPATH, WorkDaySelectors.JOB_ID_XPATH)
job_id = job_id_element.text
posted_on_element = job_element.find_element(By.XPATH, WorkDaySelectors.POSTED_ON_XPATH)
posted_on = posted_on_element.text

if posted_on.lower() in self.one_week_span_text:
job_url = job_title_element.get_attribute('href')
job_title_element.click()
job_description_element = wait.until(
EC.presence_of_element_located((By.XPATH, WorkDaySelectors.JOB_DESCRIPTION_XPATH))
)
job_description = job_description_element.text
job_description_html = job_description_element.get_attribute("innerHTML")
self.save_new_job_listing(job_description, job_description_html, job_url, job_id)
else:
posted_this_week = False
break
except StaleElementReferenceException:
continue

if not posted_this_week:
break

try:
next_page_button = wait.until(
EC.element_to_be_clickable((By.XPATH, WorkDaySelectors.NEXT_PAGE_XPATH))
)
next_page_button.click()
except TimeoutException:
self.update_func("TimeoutException. Please try again later!")
break

self.save_job_listings_to_db()
self.update_func("Scraping completed for all companies.")
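
For reference, a minimal usage sketch of the new class (not part of this diff). The queue/event wiring is inferred from the constructor signature, and it assumes the job_listings table already exists in the SQLite database:

import threading
from queue import Queue

from job_scraper.workday.scraper import WorkdayScraper

result_queue = Queue()
done_event = threading.Event()
scraper = WorkdayScraper(db_path='job_listings.db', update_func=print,
                         done_event=done_event, result_queue=result_queue)
scraper.scrape()   # blocks until all companies are scraped
done_event.wait()  # set by save_job_listings_to_db()
print(f"New listings saved: {result_queue.get()}")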
Empty file added src/__init__.py
Empty file.