From b0a86d9a8b2a60319bddf7a0156c5a94b965812e Mon Sep 17 00:00:00 2001 From: nicolasstrands Date: Mon, 7 Oct 2024 01:21:20 +0400 Subject: [PATCH] feat: added support for webpage links added support for scraping webpage links and parsing data using OpenAI --- .github/workflows/scrape.yml | 2 + .gitignore | 3 +- README.md | 5 +- fetch.py | 214 ++++++++++++++++++++++++++--------- links.json | 89 +++++++++++---- public_holidays.json | 168 +++++++++++++++++++++++++++ requirements.txt | 4 +- 7 files changed, 408 insertions(+), 77 deletions(-) create mode 100644 public_holidays.json diff --git a/.github/workflows/scrape.yml b/.github/workflows/scrape.yml index ac6e1f1..1e8188d 100644 --- a/.github/workflows/scrape.yml +++ b/.github/workflows/scrape.yml @@ -22,6 +22,8 @@ jobs: path: "./requirements.txt" - name: Running script run: python fetch.py + env: + SPECIAL_OPENAI_KEY: ${{ secrets.SPECIAL_OPENAI_KEY }} - name: Commit and push changes run: | git config user.name "Automated" diff --git a/.gitignore b/.gitignore index 38f434f..ec8fc7d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .vscode .DS_Store -path/ \ No newline at end of file +path/ +test.py \ No newline at end of file diff --git a/README.md b/README.md index aa3bea7..eaab0a6 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,8 @@ ## Dataset -### [View Dataset](https://github.com/nicolasstrands/mauritius-public-holidays-dataset/blob/main/data/public-holidays.json) +### View Datasets +- [Mauritius](https://github.com/nicolasstrands/mauritius-public-holidays-dataset/blob/main/data/public-holidays-mauritius.json) The aim of this repository and dataset is to provide a hassle-free way to use the data to build applications. @@ -36,4 +37,4 @@ The aim of this repository and dataset is to provide a hassle-free way to use th -** The exact date of this festival is subject to confirmation as its celebration depends on the visibility of the moon. 
+** The exact date of this festival is subject to confirmation as its celebration depends on the visibility of the moon. \ No newline at end of file diff --git a/fetch.py b/fetch.py index e373c48..5053a05 100644 --- a/fetch.py +++ b/fetch.py @@ -1,76 +1,182 @@ +import os import io import json import pathlib +import sys from datetime import datetime from concurrent.futures import ThreadPoolExecutor +import warnings + +warnings.simplefilter(action="ignore", category=FutureWarning) + +import requests import pypdf import pandas as pd -import requests import tabula +import openai +from bs4 import BeautifulSoup -# Read the links from the JSON file using a context manager +# Load links from a JSON file with open("links.json") as f: links_data = json.load(f) -links = links_data["links"] # Initialize the result dictionary -result_by_year = {} - - -def process_link(link): - for year, url in link.items(): +result_by_country = {} + + +# Fetch webpage content +def fetch_webpage_content(url): + try: + response = requests.get(url) + response.raise_for_status() + return response.text + except requests.exceptions.RequestException as e: + print(f"Error fetching the webpage: {e}") + return None + + +def extract_relevant_text(html_content, html_selector, html_attribute, html_value): + """ + Parses the HTML content and extracts the relevant text containing public holidays. + Adjust the parsing logic based on the webpage structure. 
+ """ + soup = BeautifulSoup(html_content, "html.parser") + + # Example: Assuming public holidays are listed within a specific table or div + # You'll need to inspect the actual webpage to identify the correct tags and classes/ids + holidays_section = soup.find( + html_selector, {html_attribute: html_value} + ) # Modify as needed + + if not holidays_section: + print("Could not find the holidays section in the webpage.") + sys.exit(1) + + text_content = holidays_section.get_text(separator="\n") + print("Successfully extracted relevant text from the webpage.") + return text_content + + +# Extract holidays using OpenAI +def extract_holidays_with_openai(content, country, year): + openai.api_key = os.getenv("SPECIAL_OPENAI_KEY") + if not openai.api_key: + print("Error: SPECIAL_OPENAI_KEY environment variable not set.") + sys.exit(1) + + prompt = f""" + Extract the public holidays for {country} in the year {year} from the following text. + Provide the data in JSON format with the structure: + {{ + "{year}": [ + {{ + "name": "Holiday Name", + "date": "YYYY-MM-DD", + }}, + ... 
+ ] + }} + + Here is the text: + {content} + """ + + try: + response = openai.ChatCompletion.create( + model="gpt-4o-mini", + messages=[ + { + "role": "system", + "content": "You are a helpful assistant that extracts public holidays from text.", + }, + {"role": "user", "content": prompt}, + ], + max_tokens=2000, + temperature=0, + ) + + return response["choices"][0]["message"]["content"] + except openai.error.OpenAIError as e: + print(f"OpenAI API error: {e}") + return None + + +# Validate and load JSON +def validate_and_load_json(json_str): + try: + json_start = json_str.find("{") + json_end = json_str.rfind("}") + 1 + json_clean = json_str[json_start:json_end] + return json.loads(json_clean) + except json.JSONDecodeError as e: + print(f"Error decoding JSON: {e}") + return None + + +# Process each link (PDF or webpage) +def process_link(country, link): + for year, link_info in link.items(): + url = link_info["url"] + link_type = link_info["type"] try: - # Read the PDF file from the URL using requests library - response = requests.get(url) - response.raise_for_status() - pdf_file = io.BytesIO(response.content) - - # Create a pypdf PdfFileReader object from the PDF file - pdf_reader = pypdf.PdfReader(pdf_file) - - # Extract the table data from the first page of the PDF using tabula-py - tables = tabula.read_pdf(url, pages=1, pandas_options={"header": None}) - - # Convert the table data to a pandas DataFrame - df = pd.DataFrame(tables[0]) - - # Convert the DataFrame to a list of dictionaries - table_data = df.to_dict(orient="records") - - # Create a dictionary with the key-value pairs - result = [] - for row in table_data: - row[1] += f" {year}" - result.append( - { - "name": row[0].title(), - "date": datetime.strptime(row[1], "%A %d %B %Y").strftime( - "%Y-%m-%d" - ), - } - ) - - # Group the result by year - if year not in result_by_year: - result_by_year[year] = [] - result_by_year[year].extend(result) - except requests.RequestException as e: - print(f"Failed to 
fetch {url}: {e}") + if link_type == "pdf": + response = requests.get(url) + response.raise_for_status() + pdf_file = io.BytesIO(response.content) + pdf_reader = pypdf.PdfReader(pdf_file) + tables = tabula.read_pdf(url, pages=1, pandas_options={"header": None}) + df = pd.DataFrame(tables[0]) + table_data = df.to_dict(orient="records") + + result = [] + for row in table_data: + row[1] += f" {year}" + result.append( + { + "name": row[0].title(), + "date": datetime.strptime(row[1], "%A %d %B %Y").strftime( + "%Y-%m-%d" + ), + } + ) + + if country not in result_by_country: + result_by_country[country] = {} + if year not in result_by_country[country]: + result_by_country[country][year] = [] + result_by_country[country][year].extend(result) + + elif link_type == "webpage": + content = fetch_webpage_content(url) + if content: + relevant_text = extract_relevant_text( + content, link_info["tag"], link_info["attr"], link_info["value"] + ) + if relevant_text: + extracted_json_str = extract_holidays_with_openai( + relevant_text, country, year + ) + data = validate_and_load_json(extracted_json_str) + if data: + if country not in result_by_country: + result_by_country[country] = {} + if year not in result_by_country[country]: + result_by_country[country][year] = [] + result_by_country[country][year].extend(data[year]) + except Exception as e: print(f"Error processing {url}: {e}") -# Use ThreadPoolExecutor to process links concurrently +# Process links concurrently; country is bound as a lambda default because queued tasks may run after the loop has moved on with ThreadPoolExecutor() as executor: - executor.map(process_link, links) + for country, links in links_data["countries"].items(): + executor.map(lambda link, country=country: process_link(country, link), links) -# Sort the result by year -sorted_result_by_year = dict(sorted(result_by_year.items())) - -# Create the data folder if it does not exist using pathlib library +# Create the data folder and save output pathlib.Path("data").mkdir(parents=True, exist_ok=True) - -# Write the output to a JSON file in the data folder using pathlib 
library -with open(pathlib.Path("data") / "public-holidays.json", "w") as outfile: - json.dump(sorted_result_by_year, outfile, indent=4) +for country, data in result_by_country.items(): + sorted_result_by_year = dict(sorted(data.items())) + with open(pathlib.Path("data") / f"public-holidays-{country}.json", "w") as outfile: + json.dump(sorted_result_by_year, outfile, indent=4) diff --git a/links.json b/links.json index fa7a903..7516458 100644 --- a/links.json +++ b/links.json @@ -1,20 +1,71 @@ { - "links": [ - { - "2020": "https://pmo.govmu.org/Communique/Public%20holidays%20--%202020.pdf" - }, - { - "2021": "https://pmo.govmu.org/Communique/Public%20holidays%20-%202021.pdf" - }, - { - "2022": "https://pmo.govmu.org/Communique/Public%20holidays%20-%202022.pdf" - }, - { "2023": "https://pmo.govmu.org/Communique/Public_Holidays_2023.pdf" }, - { - "2024": "https://pmo.govmu.org/Communique/Notice%20-%20Final%20Public%20holidays%20-%202024.pdf" - }, - { - "2025": "https://pmo.govmu.org/Communique/Notice-Public_Holidays_2025.pdf" - } - ] -} + "countries": { + "mu": [ + { + "2020": { + "url": "https://pmo.govmu.org/Communique/Public%20holidays%20--%202020.pdf", + "type": "pdf" + } + }, + { + "2021": { + "url": "https://pmo.govmu.org/Communique/Public%20holidays%20-%202021.pdf", + "type": "pdf" + } + }, + { + "2022": { + "url": "https://pmo.govmu.org/Communique/Public%20holidays%20-%202022.pdf", + "type": "pdf" + } + }, + { + "2023": { + "url": "https://pmo.govmu.org/Communique/Public_Holidays_2023.pdf", + "type": "pdf" + } + }, + { + "2024": { + "url": "https://pmo.govmu.org/Communique/Notice%20-%20Final%20Public%20holidays%20-%202024.pdf", + "type": "pdf" + } + }, + { + "2025": { + "url": "https://pmo.govmu.org/Communique/Notice-Public_Holidays_2025.pdf", + "type": "pdf" + } + } + ], + "sa": [ + { + "2024": { + "url": "https://www.gov.za/about-sa/public-holidays", + "type": "webpage", + "tag": "div", + "attr": "id", + "value": "block-eco-omega-system-main" + } + }, + { + 
"2025": { + "url": "https://www.gov.za/about-sa/public-holidays", + "type": "webpage", + "tag": "div", + "attr": "id", + "value": "block-eco-omega-system-main" + } + }, + { + "2026": { + "url": "https://www.gov.za/about-sa/public-holidays", + "type": "webpage", + "tag": "div", + "attr": "id", + "value": "block-eco-omega-system-main" + } + } + ] + } +} \ No newline at end of file diff --git a/public_holidays.json b/public_holidays.json new file mode 100644 index 0000000..54f1159 --- /dev/null +++ b/public_holidays.json @@ -0,0 +1,168 @@ +{ + "2024": [ + { + "name": "New Year’s Day", + "date": "2024-01-01" + }, + { + "name": "Human Rights Day", + "date": "2024-03-21" + }, + { + "name": "Good Friday", + "date": "2024-03-29" + }, + { + "name": "Family Day", + "date": "2024-04-01" + }, + { + "name": "Freedom Day", + "date": "2024-04-27" + }, + { + "name": "Workers' Day", + "date": "2024-05-01" + }, + { + "name": "Public holiday (General Elections)", + "date": "2024-05-29" + }, + { + "name": "Youth Day", + "date": "2024-06-16" + }, + { + "name": "Public holiday Youth Day observed", + "date": "2024-06-17" + }, + { + "name": "National Women’s Day", + "date": "2024-08-09" + }, + { + "name": "Heritage Day", + "date": "2024-09-24" + }, + { + "name": "Day of Reconciliation", + "date": "2024-12-16" + }, + { + "name": "Christmas Day", + "date": "2024-12-25" + }, + { + "name": "Day of Goodwill", + "date": "2024-12-26" + } + ], + "2025": [ + { + "name": "New Year’s Day", + "date": "2025-01-01" + }, + { + "name": "Human Rights Day", + "date": "2025-03-21" + }, + { + "name": "Good Friday", + "date": "2025-04-18" + }, + { + "name": "Family Day", + "date": "2025-04-21" + }, + { + "name": "Freedom Day", + "date": "2025-04-27" + }, + { + "name": "Public holiday Freedom Day observed", + "date": "2025-04-28" + }, + { + "name": "Workers' Day", + "date": "2025-05-01" + }, + { + "name": "Youth Day", + "date": "2025-06-16" + }, + { + "name": "National Women’s Day", + "date": "2025-08-09" + }, 
+ { + "name": "Heritage Day", + "date": "2025-09-24" + }, + { + "name": "Day of Reconciliation", + "date": "2025-12-16" + }, + { + "name": "Christmas Day", + "date": "2025-12-25" + }, + { + "name": "Day of Goodwill", + "date": "2025-12-26" + } + ], + "2026": [ + { + "name": "New Year’s Day", + "date": "2026-01-01" + }, + { + "name": "Human Rights Day", + "date": "2026-03-21" + }, + { + "name": "Good Friday", + "date": "2026-04-03" + }, + { + "name": "Family Day", + "date": "2026-04-06" + }, + { + "name": "Freedom Day", + "date": "2026-04-27" + }, + { + "name": "Workers' Day", + "date": "2026-05-01" + }, + { + "name": "Youth Day", + "date": "2026-06-16" + }, + { + "name": "National Women’s Day", + "date": "2026-08-09" + }, + { + "name": "Public holiday National Women’s Day observed", + "date": "2026-08-10" + }, + { + "name": "Heritage Day", + "date": "2026-09-24" + }, + { + "name": "Day of Reconciliation", + "date": "2026-12-16" + }, + { + "name": "Christmas Day", + "date": "2026-12-25" + }, + { + "name": "Day of Goodwill", + "date": "2026-12-26" + } + ] +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index d3a7e82..c9e575b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,6 @@ pypdf python-dotenv==1.0.0 tabula==1.0.5 tabula_py==2.7.0 -requests \ No newline at end of file +requests +openai==0.28 +bs4 \ No newline at end of file