feature/workday_scraper #67

Merged
4 changes: 3 additions & 1 deletion README.md
@@ -44,7 +44,9 @@ Note: If you want to add another source of job listings, [go to this issue](http


## Updates

* Added new scraper for Workday, currently scraping NVIDIA, CROWDSTRIKE, RED HAT and SALESFORCE.
* The scraper currently covers all countries and only collects posts no older than a **week**!

* Building in public:
* ❤️ If you want to contribute to this project and want to take a crack at writing tests for it, it would be amazing! 🤗 Here's a ticket to write a new test, and a walk-through of the current test code: [Request to create: Test displaying the resume text](https://github.com/nicobrenner/commandjobs/issues/48) 🙏🏼

4 changes: 3 additions & 1 deletion config/requirements.txt
@@ -2,4 +2,6 @@ beautifulsoup4==4.9.3
requests==2.25.1
openai
python-dotenv
windows-curses; sys_platform == 'win32'
windows-curses; sys_platform == 'win32'
selenium==4.25.0
webdriver-manager==4.0.2
5 changes: 5 additions & 0 deletions docker/Dockerfile
@@ -10,6 +10,11 @@ WORKDIR /commandjobs
# Install any needed packages specified in requirements.txt
RUN pip3 install --no-cache-dir -r config/requirements.txt

# Install required packages, including Chromium and ChromeDriver
**Owner:** Excellent :)

RUN apt-get update && \
    apt-get install -y wget unzip chromium chromium-driver && \
    apt-get clean

# Run menu.py when the container launches
CMD ["python3", "src/menu.py"]
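
Note that the scraper below still downloads a driver at runtime through webdriver-manager, while this Dockerfile bakes Debian's `chromium` and `chromium-driver` packages into the image. A minimal sketch of pointing Selenium at the packaged binaries instead; the `/usr/bin` paths are assumptions based on the Debian packages, not something this PR configures:

```python
# Sketch only: use the Chromium/ChromeDriver installed by apt inside the image,
# instead of downloading a driver with webdriver-manager at container start.
# The binary paths below are assumptions based on Debian's chromium packages.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

options = Options()
options.binary_location = "/usr/bin/chromium"   # assumed path from the 'chromium' package
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

service = Service("/usr/bin/chromedriver")      # assumed path from 'chromium-driver'
driver = webdriver.Chrome(service=service, options=options)
```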

6 changes: 5 additions & 1 deletion docker/docker-compose.yml
@@ -8,20 +8,24 @@ services:
    build:
      context: ..
      dockerfile: docker/Dockerfile

    # Set container name
    container_name: commandjobs

    # Set environment variables
    environment:
      - MENU_APP=src/menu.py
      - PYTHONPATH=/commandjobs

    # Mount entire project into docker container under /repo
    volumes:
      - ../:/commandjobs
      - ./docker/docker-entrypoint.sh:/commandjobs/docker-entrypoint.sh

    # Use host network mode (may require changes depending on Docker environment)
    network_mode: host

    tty: true          # Allocate a pseudo-TTY
    stdin_open: true   # Keep STDIN open

    working_dir: /commandjobs
Empty file added job_scraper/__init__.py
Empty file.
2 changes: 0 additions & 2 deletions src/hn_scraper.py → job_scraper/hacker_news/scraper.py
@@ -1,8 +1,6 @@
import requests
from bs4 import BeautifulSoup
import sqlite3
import os
from queue import Queue

# Define a new exception for interrupting scraping
class ScrapingInterrupt(Exception):
Empty file.
10 changes: 10 additions & 0 deletions job_scraper/scraper_selectors/workday_selectors.py
@@ -0,0 +1,10 @@
from enum import StrEnum


class WorkDaySelectors(StrEnum):
    JOB_LISTING_XPATH = '//li[@class="css-1q2dra3"]'
    JOB_TITLE_XPATH = './/h3/a'
    JOB_ID_XPATH = './/ul[@data-automation-id="subtitle"]/li'
    POSTED_ON_XPATH = './/dd[@class="css-129m7dg"][preceding-sibling::dt[contains(text(),"posted on")]]'
    JOB_DESCRIPTION_XPATH = '//div[@data-automation-id="jobPostingDescription"]'
    NEXT_PAGE_XPATH = "//button[@data-uxi-element-id='next']"
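
`WorkDaySelectors` subclasses `StrEnum` (Python 3.11+), so each member is itself a plain `str` and can be handed straight to Selenium's `By.XPATH` lookups without calling `.value`. A tiny check of that behaviour, assuming the package is importable:

```python
# Tiny check of the behaviour the scraper relies on: StrEnum members are plain
# strings (Python 3.11+), so they can be passed directly where Selenium expects
# an XPath string, without calling .value.
from job_scraper.scraper_selectors.workday_selectors import WorkDaySelectors

assert isinstance(WorkDaySelectors.JOB_TITLE_XPATH, str)
assert WorkDaySelectors.JOB_TITLE_XPATH == './/h3/a'
print(f"Title selector: {WorkDaySelectors.JOB_TITLE_XPATH}")  # -> Title selector: .//h3/a
```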
13 changes: 13 additions & 0 deletions job_scraper/utils.py
@@ -0,0 +1,13 @@
def get_workday_company_urls() -> dict:
    urls = {
        'NVIDIA': 'https://nvidia.wd5.myworkdayjobs.com/NVIDIAExternalCareerSite?jobFamilyGroup=0c40f6bd1d8f10ae43ffaefd46dc7e78',
        'SALESFORCE': 'https://salesforce.wd12.myworkdayjobs.com/en-US/External_Career_Site/details/Lead-Marketing-Cloud-Solution-Engineer_JR268932?jobFamilyGroup=14fa3452ec7c1011f90d0002a2100000',
        'RED_HAT': 'https://redhat.wd5.myworkdayjobs.com/Jobs',
        'CROWDSTRIKE': 'https://crowdstrike.wd5.myworkdayjobs.com/crowdstrikecareers'
    }
    return urls


def get_workday_post_time_range() -> list[str]:
    return ['posted today', 'posted yesterday', 'posted 2 days ago', 'posted 3 days ago',
            'posted 4 days ago', 'posted 5 days ago', 'posted 6 days ago', 'posted 7 days ago']
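
`get_workday_post_time_range()` returns the literal "posted …" labels Workday renders for the last week; the scraper lowercases the scraped label and stops paginating at the first one outside this list. A small illustrative sketch of that check, assuming the package is importable:

```python
# Sketch of the one-week window check used in scrape(): the scraped "posted on"
# text is lowercased and tested against this allow-list of labels.
from job_scraper.utils import get_workday_post_time_range

one_week_labels = get_workday_post_time_range()

print('Posted Today'.lower() in one_week_labels)         # True  -> still inside the window
print('posted 30+ days ago'.lower() in one_week_labels)  # False -> stop paginating
```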

Empty file added job_scraper/waas/__init__.py
Empty file.
File renamed without changes.
Empty file added job_scraper/workday/__init__.py
Empty file.
129 changes: 129 additions & 0 deletions job_scraper/workday/scraper.py
@@ -0,0 +1,129 @@
import sqlite3
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from job_scraper.scraper_selectors.workday_selectors import WorkDaySelectors
from job_scraper.utils import get_workday_post_time_range, get_workday_company_urls


class WorkdayScraper:
    def __init__(self, db_path='job_listings.db', update_func=None, done_event=None, result_queue=None):
        self.db_path = db_path
        self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=self.get_selenium_configs())
**Owner:** Very helpful 👍🏼
        self.one_week_span_text = get_workday_post_time_range()
        self.company_urls = get_workday_company_urls()
        self.new_entries_count = 0
        self.done_event = done_event
        self.result_queue = result_queue
        self.update_func = update_func
        self.job_listings = []

    @staticmethod
    def get_selenium_configs() -> Options:
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        return chrome_options

    def save_to_database(self, original_text, original_html, source, external_id):
        conn = sqlite3.connect(self.db_path)
        c = conn.cursor()
        c.execute("INSERT OR IGNORE INTO job_listings (original_text, original_html, source, external_id) VALUES (?, ?, ?, ?)",
                  (original_text, original_html, source, external_id))
        conn.commit()
        conn.close()
        return c.rowcount > 0
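
Note that `INSERT OR IGNORE` only deduplicates if the `job_listings` table enforces uniqueness; the table is created elsewhere in the project and is not part of this diff. A hedged sketch of the kind of schema the dedup assumes, with column names taken from the INSERT above (the `UNIQUE` constraint on `external_id` is an assumption, not something this PR defines):

```python
# Sketch only: shows the kind of UNIQUE constraint that INSERT OR IGNORE needs
# in order to actually skip duplicate rows. Column names come from the INSERT
# statement above; the UNIQUE(external_id) constraint is an assumption.
import sqlite3

conn = sqlite3.connect('job_listings.db')
conn.execute("""
    CREATE TABLE IF NOT EXISTS job_listings (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        original_text TEXT,
        original_html TEXT,
        source TEXT,
        external_id TEXT UNIQUE
    )
""")
conn.commit()
conn.close()
```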

    def save_new_job_listing(self, job_description, job_description_html, job_url, job_id):
        if not job_description:
            return
        if not job_description_html:
            return
        if not job_url:
            return
        if not job_id:
            return
        self.job_listings.append({
            'original_text': job_description,
            'original_html': job_description_html,
            'source': job_url,
            'external_id': job_id
        })

    def save_job_listings_to_db(self):
        for job in self.job_listings:
            inserted = self.save_to_database(
                job['original_text'],
                job['original_html'],
                job['source'],
                job['external_id']
            )
            if inserted:
                self.new_entries_count += 1
        if self.done_event:
            self.result_queue.put(self.new_entries_count)
            self.done_event.set()

    def scrape(self):
        self.update_func(f"Scraping Workday companies:\t{', '.join(self.company_urls.keys())}")

        for company_name, company_url in self.company_urls.items():
            self.driver.get(company_url)
            wait = WebDriverWait(self.driver, 10)

            posted_this_week = True
            while posted_this_week:
                try:
                    wait.until(EC.presence_of_element_located((By.XPATH, WorkDaySelectors.JOB_LISTING_XPATH)))
                except TimeoutException:
                    self.update_func("Job Listing Element not found. Try again later")
                    break

                job_elements = self.driver.find_elements(By.XPATH, WorkDaySelectors.JOB_LISTING_XPATH)
                for job_element in job_elements:
                    try:
                        self.update_func(f"Scraping {company_name}: {self.driver.current_url}")
                        job_title_element = job_element.find_element(By.XPATH, WorkDaySelectors.JOB_TITLE_XPATH)
                        job_id_element = job_element.find_element(By.XPATH, WorkDaySelectors.JOB_ID_XPATH)
                        job_id = job_id_element.text
                        posted_on_element = job_element.find_element(By.XPATH, WorkDaySelectors.POSTED_ON_XPATH)
                        posted_on = posted_on_element.text

                        if posted_on.lower() in self.one_week_span_text:
                            job_url = job_title_element.get_attribute('href')
                            job_title_element.click()
                            job_description_element = wait.until(
                                EC.presence_of_element_located((By.XPATH, WorkDaySelectors.JOB_DESCRIPTION_XPATH))
                            )
                            job_description = job_description_element.text
                            job_description_html = job_description_element.get_attribute("innerHTML")
                            self.save_new_job_listing(job_description, job_description_html, job_url, job_id)
                        else:
                            posted_this_week = False
                            break
                    except StaleElementReferenceException:
                        continue

                if not posted_this_week:
                    break

                try:
                    next_page_button = wait.until(
                        EC.element_to_be_clickable((By.XPATH, WorkDaySelectors.NEXT_PAGE_XPATH))
                    )
                    next_page_button.click()
                except TimeoutException:
                    self.update_func("TimeoutException. Please try again later!")
                    break

        self.save_job_listings_to_db()
        self.update_func("Scraping completed for all companies.")
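
The constructor's `update_func`, `done_event` and `result_queue` parameters suggest the scraper is meant to run on a background thread, with the caller reading the new-entry count off the queue once `done_event` fires. A hedged usage sketch; how `src/menu.py` actually wires this up is not shown in this diff:

```python
# Sketch only: one plausible way to drive WorkdayScraper from a background
# thread. How menu.py actually integrates the scraper is not part of this diff.
import threading
from queue import Queue

from job_scraper.workday.scraper import WorkdayScraper

done_event = threading.Event()
result_queue = Queue()

scraper = WorkdayScraper(
    db_path='job_listings.db',
    update_func=print,            # status lines go to stdout in this sketch
    done_event=done_event,
    result_queue=result_queue,
)

worker = threading.Thread(target=scraper.scrape, daemon=True)
worker.start()

done_event.wait()                 # set by save_job_listings_to_db() when scraping finishes
print(f"New job listings saved: {result_queue.get()}")
```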
Empty file added src/__init__.py
Empty file.