
feature/workday_scraper #67

Merged
3 changes: 2 additions & 1 deletion config/requirements.txt
@@ -3,4 +3,5 @@ requests==2.25.1
 openai
 python-dotenv
 windows-curses; sys_platform == 'win32'
-selenium==4.25.0
+selenium==4.25.0
+webdriver-manager==4.0.2
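
For reference, a minimal sketch of how the two pinned packages are typically paired (assuming nothing beyond what this PR already imports): webdriver-manager resolves and caches a chromedriver binary that matches the installed browser, and Selenium 4 is pointed at that path through a Service object.

    # Minimal sketch pairing webdriver-manager with Selenium 4.
    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager

    driver_path = ChromeDriverManager().install()  # downloads once, then reuses the cached binary
    driver = webdriver.Chrome(service=Service(driver_path))
    driver.get("https://example.com")
    driver.quit()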
5 changes: 5 additions & 0 deletions docker/Dockerfile
@@ -10,6 +10,11 @@ WORKDIR /commandjobs
 # Install any needed packages specified in requirements.txt
 RUN pip3 install --no-cache-dir -r config/requirements.txt
 
+# Install required packages, including Chromium and ChromeDriver

Owner commented: Excellent :)

+RUN apt-get update && \
+    apt-get install -y wget unzip chromium chromium-driver && \
+    apt-get clean
+
 # Run menu.py when the container launches
 CMD ["python3", "src/menu.py"]
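
Since the image now ships Chromium and its driver from apt, a possible simplification (a sketch assuming Debian's usual package paths, not something this PR does) is to point Selenium at the system binaries and skip the runtime download entirely, which avoids a network fetch on every container start:

    # Hypothetical: use the apt-installed browser and driver inside the container.
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service

    opts = Options()
    opts.binary_location = "/usr/bin/chromium"    # path used by Debian's chromium package
    service = Service("/usr/bin/chromedriver")    # installed by chromium-driver
    driver = webdriver.Chrome(service=service, options=opts)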

17 changes: 5 additions & 12 deletions job_scraper/workday/scraper.py
@@ -2,6 +2,8 @@
 import time
 from selenium import webdriver
 from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
@@ -14,7 +16,7 @@
 class WorkdayScraper:
     def __init__(self, db_path='job_listings.db', update_func=None, done_event=None, result_queue=None):
         self.db_path = db_path
-        self.driver = webdriver.Chrome(options=self.get_selenium_configs())
+        self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=self.get_selenium_configs())

Owner commented: Very helpful 👍🏼

         self.one_week_span_text = get_workday_post_time_range()
         self.company_urls = get_workday_company_urls()
         self.new_entries_count = 0
@@ -29,7 +31,6 @@ def get_selenium_configs() -> Options:
         chrome_options.add_argument("--headless")
         chrome_options.add_argument("--no-sandbox")
         chrome_options.add_argument("--disable-dev-shm-usage")
-        chrome_options.add_argument("--remote-debugging-port=9222")
         chrome_options.add_argument("--disable-gpu")
         return chrome_options
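
These flags are the usual set for Chrome in containers: --headless because there is no display, --no-sandbox because the browser typically runs as root in Docker, and --disable-dev-shm-usage because Docker's default 64 MB /dev/shm is too small for Chrome. Dropping the fixed --remote-debugging-port=9222 also avoids a port clash if more than one scraper runs on the same host. A quick sanity check (a sketch; it assumes job_scraper is importable as a package and that get_selenium_configs stays callable without an instance):

    from job_scraper.workday.scraper import WorkdayScraper

    opts = WorkdayScraper.get_selenium_configs()
    assert "--headless" in opts.arguments                       # Options.arguments lists the added flags
    assert "--remote-debugging-port=9222" not in opts.arguments # removed in this PR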

@@ -75,11 +76,10 @@ def save_job_listings_to_db(self):
         self.done_event.set()
 
     def scrape(self):
-        total_companies = len(self.company_urls)
         for company_name, company_url in self.company_urls.items():
             self.driver.get(company_url)
             wait = WebDriverWait(self.driver, 10)
-            self.update_func(f"Scraping {company_name}...")
+            self.update_func(f"Scraping Workday companies:\t{', '.join(self.company_urls.keys())}")
 
             posted_this_week = True
             while posted_this_week:
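
One detail on the new status line: an f-string can only reuse the enclosing quote character inside a replacement field from Python 3.12 onward, so the join separator is written with single quotes to keep the scraper running on earlier interpreters. A minimal illustration:

    # Portable across Python 3.x versions:
    names = ["Acme", "Globex"]
    msg = f"Scraping Workday companies:\t{', '.join(names)}"
    # f"...{", ".join(names)}" is a SyntaxError before Python 3.12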
@@ -92,6 +92,7 @@ def scrape(self):
                 job_elements = self.driver.find_elements(By.XPATH, WorkDaySelectors.JOB_LISTING_XPATH)
                 for job_element in job_elements:
                     try:
+                        self.update_func(f"*{company_name}* \n {self.driver.current_url}")
                         job_title_element = job_element.find_element(By.XPATH, WorkDaySelectors.JOB_TITLE_XPATH)
                         job_id_element = job_element.find_element(By.XPATH, WorkDaySelectors.JOB_ID_XPATH)
                         job_id = job_id_element.text
@@ -113,7 +114,6 @@ def scrape(self):
                     except StaleElementReferenceException:
                         self.update_func("Encountered an issue while fetching job list. Retrying...")
                         time.sleep(1)
-                        continue
 
                 if not posted_this_week:
                     break
@@ -127,12 +127,5 @@ def scrape(self):
                 self.update_func("TimeoutException. Please try again later!")
                 break
 
-            progress_percent = (company_name / total_companies) * 100
-            self.update_func(f"Scraping: {progress_percent:.0f}% - Completed.")
-
         self.save_job_listings_to_db()
         self.update_func("Scraping completed for all companies.")
-
-
-scraper = WorkdayScraper()
-scraper.scrape()
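
Dropping the module-level scraper = WorkdayScraper() / scraper.scrape() pair means importing the module no longer launches a browser as a side effect, which is what lets menu.py drive the scraper. If a standalone entry point is ever wanted again, a guard would keep the import clean (a sketch, not part of this diff):

    # Hypothetical entry point; the PR simply deletes the two module-level lines.
    if __name__ == "__main__":
        scraper = WorkdayScraper()
        scraper.scrape()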
11 changes: 6 additions & 5 deletions src/menu.py
@@ -66,7 +66,8 @@ def __init__(self, stdscr, logger):
         if self.total_ai_job_recommendations > 0:
             ai_recommendations_menu = f"✅ {self.total_ai_job_recommendations} recommended listings, out of {total_processed}"
 
-        self.menu_items = [resume_menu, "🕸 Scrape \"Ask HN: Who's hiring?\"",
+        self.menu_items = [resume_menu,
+                           "🕸 Scrape \"Ask HN: Who's hiring?\"",
                            "🕸 Scrape \"Work at a Startup jobs\"",
+                           "🕸 Scrape \"Workday\"",

Owner commented: 😄

                            db_menu_item, find_best_matches_menu,
@@ -218,9 +219,9 @@ def update_menu_items(self):
 
         # Update the relevant menu items
         self.menu_items[0] = resume_menu
-        self.menu_items[3] = db_menu_item
-        self.menu_items[4] = find_best_matches_menu
-        self.menu_items[5] = ai_recommendations_menu
+        self.menu_items[4] = db_menu_item
+        self.menu_items[5] = find_best_matches_menu
+        self.menu_items[6] = ai_recommendations_menu
 
         # Redraw the menu to reflect the updated items
         self.draw_menu()
@@ -238,7 +239,7 @@ def execute_menu_action(self):
         elif self.current_row == 2: # Scrape Work at a Startup jobs
             self.start_scraping_WaaS_with_status_updates()
         elif self.current_row == 3: # Scrape Workday
-            self.start_scraping_WaaS_with_status_updates()
+            self.start_scraping_workday_with_status_updates()
         elif self.current_row == 4: # Navigate jobs in local db
             draw_table(self.stdscr, self.db_path)
         elif self.current_row == 5: # "Process job listings with GPT" option
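
A side note on the index bumps above: inserting the Workday entry shifted every later menu row by one, and the old row-3 handler had copy-pasted the WaaS call (corrected here to start_scraping_workday_with_status_updates). One way to make future insertions safer (a hypothetical refactor, not part of this PR) is to name the rows once:

    # Hypothetical named menu rows; values mirror the list order after this PR.
    from enum import IntEnum

    class MenuRow(IntEnum):
        RESUME = 0
        SCRAPE_HN = 1
        SCRAPE_WAAS = 2
        SCRAPE_WORKDAY = 3
        BROWSE_DB = 4
        FIND_BEST_MATCHES = 5
        AI_RECOMMENDATIONS = 6

    # e.g. in execute_menu_action:
    #     elif self.current_row == MenuRow.SCRAPE_WORKDAY:
    #         self.start_scraping_workday_with_status_updates()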