feat: added support for webpage links
added support for scraping webpage links and parsing data using OpenAI
nicolasstrands committed Oct 6, 2024
1 parent 7b32e5c commit b0a86d9
Showing 7 changed files with 408 additions and 77 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/scrape.yml
@@ -22,6 +22,8 @@ jobs:
          path: "./requirements.txt"
      - name: Running script
        run: python fetch.py
        env:
          SPECIAL_OPENAI_KEY: ${{ secrets.SPECIAL_OPENAI_KEY }}
      - name: Commit and push changes
        run: |
          git config user.name "Automated"
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,3 +1,4 @@
.vscode
.DS_Store
path/
test.py
5 changes: 3 additions & 2 deletions README.md
@@ -2,7 +2,8 @@

## Dataset

### View Datasets
- [Mauritius](https://github.com/nicolasstrands/mauritius-public-holidays-dataset/blob/main/data/public-holidays-mauritius.json)

The aim of this repository and dataset is to provide a hassle-free way to use the data to build applications.

@@ -36,4 +37,4 @@

</details>

** The exact date of this festival is subject to confirmation as its celebration depends on the visibility of the moon.
214 changes: 160 additions & 54 deletions fetch.py
@@ -1,76 +1,182 @@
import os
import io
import json
import pathlib
import sys
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor

import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

import requests
import pypdf
import pandas as pd
import tabula
import openai
from bs4 import BeautifulSoup

# Load links from a JSON file
with open("links.json") as f:
    links_data = json.load(f)

# Results grouped by country, then by year
result_by_country = {}


# Fetch webpage content
def fetch_webpage_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None
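As written, the GET has no timeout, so one slow server can stall a worker thread indefinitely. A hypothetical hardened variant (not part of this commit; the function name and User-Agent value are illustrative):

# Hardening sketch: a timeout and an explicit User-Agent keep a slow or
# bot-blocking server from stalling the thread pool.
def fetch_webpage_content_safe(url, timeout=30):
    try:
        response = requests.get(
            url,
            timeout=timeout,  # fail fast instead of hanging a worker thread
            headers={"User-Agent": "public-holidays-scraper/1.0"},
        )
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None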


def extract_relevant_text(html_content, html_selector, html_attribute, html_value):
    """
    Parses the HTML content and extracts the relevant text containing public holidays.
    Adjust the parsing logic based on the webpage structure.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Locate the element that lists the holidays; the selector triple
    # (tag, attribute, value) comes from links.json and is site-specific
    holidays_section = soup.find(html_selector, {html_attribute: html_value})

    if not holidays_section:
        print("Could not find the holidays section in the webpage.")
        sys.exit(1)

    text_content = holidays_section.get_text(separator="\n")
    print("Successfully extracted relevant text from the webpage.")
    return text_content
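For the South Africa pages this commit registers, the call reads as follows (selector values copied from links.json; variable names illustrative):

html = fetch_webpage_content("https://www.gov.za/about-sa/public-holidays")
if html:
    text = extract_relevant_text(
        html,
        html_selector="div",
        html_attribute="id",
        html_value="block-eco-omega-system-main",
    )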


# Extract holidays using OpenAI
def extract_holidays_with_openai(content, country, year):
    openai.api_key = os.getenv("SPECIAL_OPENAI_KEY")
    if not openai.api_key:
        print("Error: SPECIAL_OPENAI_KEY environment variable not set.")
        sys.exit(1)

    prompt = f"""
    Extract the public holidays for {country} in the year {year} from the following text.
    Provide the data in JSON format with the structure:
    {{
        "{year}": [
            {{
                "name": "Holiday Name",
                "date": "YYYY-MM-DD"
            }},
            ...
        ]
    }}
    Here is the text:
    {content}
    """

    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that extracts public holidays from text.",
                },
                {"role": "user", "content": prompt},
            ],
            max_tokens=2000,
            temperature=0,
        )

        return response["choices"][0]["message"]["content"]
    except openai.error.OpenAIError as e:
        print(f"OpenAI API error: {e}")
        return None
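Note that openai.ChatCompletion and openai.error belong to the legacy pre-1.0 openai package, so the pinned dependency matters. If the package were ever upgraded, the equivalent call under the >=1.0 client would look roughly like this sketch (function name illustrative):

import os
from openai import OpenAI

def extract_holidays_with_openai_v1(prompt):
    # Same model and parameters as above, expressed with the v1 client API
    client = OpenAI(api_key=os.getenv("SPECIAL_OPENAI_KEY"))
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant that extracts public holidays from text.",
            },
            {"role": "user", "content": prompt},
        ],
        max_tokens=2000,
        temperature=0,
    )
    return response.choices[0].message.content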


# Validate and load JSON
def validate_and_load_json(json_str):
    try:
        # Slice from the first "{" to the last "}" so markdown fences or
        # prose wrapped around the model's answer do not break json.loads
        json_start = json_str.find("{")
        json_end = json_str.rfind("}") + 1
        json_clean = json_str[json_start:json_end]
        return json.loads(json_clean)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return None
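The brace slicing tolerates the common case of a model wrapping its answer in a code fence. Illustrative input and output:

raw = '```json\n{"2024": [{"name": "New Year", "date": "2024-01-01"}]}\n```'
data = validate_and_load_json(raw)
# data == {"2024": [{"name": "New Year", "date": "2024-01-01"}]}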


# Process each link (PDF or webpage)
def process_link(country, link):
    for year, link_info in link.items():
        url = link_info["url"]
        link_type = link_info["type"]
        try:
            if link_type == "pdf":
                # Fetch the PDF and extract the first-page table with tabula-py
                response = requests.get(url)
                response.raise_for_status()
                pdf_file = io.BytesIO(response.content)
                pdf_reader = pypdf.PdfReader(pdf_file)
                tables = tabula.read_pdf(url, pages=1, pandas_options={"header": None})
                df = pd.DataFrame(tables[0])
                table_data = df.to_dict(orient="records")

                # Build {name, date} records; the PDF dates read like
                # "Monday 1 January", so the year is appended before parsing
                result = []
                for row in table_data:
                    row[1] += f" {year}"
                    result.append(
                        {
                            "name": row[0].title(),
                            "date": datetime.strptime(row[1], "%A %d %B %Y").strftime(
                                "%Y-%m-%d"
                            ),
                        }
                    )

                if country not in result_by_country:
                    result_by_country[country] = {}
                if year not in result_by_country[country]:
                    result_by_country[country][year] = []
                result_by_country[country][year].extend(result)

            elif link_type == "webpage":
                content = fetch_webpage_content(url)
                if content:
                    relevant_text = extract_relevant_text(
                        content, link_info["tag"], link_info["attr"], link_info["value"]
                    )
                    if relevant_text:
                        extracted_json_str = extract_holidays_with_openai(
                            relevant_text, country, year
                        )
                        # Guard against a failed OpenAI call returning None
                        if extracted_json_str:
                            data = validate_and_load_json(extracted_json_str)
                            if data:
                                if country not in result_by_country:
                                    result_by_country[country] = {}
                                if year not in result_by_country[country]:
                                    result_by_country[country][year] = []
                                result_by_country[country][year].extend(data[year])

        except Exception as e:
            print(f"Error processing {url}: {e}")


# Process links concurrently; bind country per iteration so worker threads
# do not all observe the loop's final value
with ThreadPoolExecutor() as executor:
    for country, country_links in links_data["countries"].items():
        executor.map(lambda link, country=country: process_link(country, link), country_links)
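An equivalent, arguably clearer spelling uses functools.partial (a sketch, not in this commit): partial freezes the current country for each task, and wrapping the map in list() forces any worker exception to surface here instead of being silently discarded.

from functools import partial

with ThreadPoolExecutor() as executor:
    for country, country_links in links_data["countries"].items():
        # partial binds country eagerly; list() consumes results so errors raise
        list(executor.map(partial(process_link, country), country_links))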

# Create the data folder and save one JSON file per country
pathlib.Path("data").mkdir(parents=True, exist_ok=True)

for country, data in result_by_country.items():
    sorted_result_by_year = dict(sorted(data.items()))
    with open(pathlib.Path("data") / f"public-holidays-{country}.json", "w") as outfile:
        json.dump(sorted_result_by_year, outfile, indent=4)
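An illustrative consumer of the per-country output (the file name follows the pattern above; the holiday values shown are examples, not guaranteed contents):

import json

with open("data/public-holidays-mu.json") as f:
    holidays = json.load(f)

# e.g. {"2024": [{"name": "New Year", "date": "2024-01-01"}, ...]}
for entry in holidays.get("2024", []):
    print(entry["date"], entry["name"])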
89 changes: 70 additions & 19 deletions links.json
@@ -1,20 +1,71 @@
{
    "countries": {
        "mu": [
            {
                "2020": {
                    "url": "https://pmo.govmu.org/Communique/Public%20holidays%20--%202020.pdf",
                    "type": "pdf"
                }
            },
            {
                "2021": {
                    "url": "https://pmo.govmu.org/Communique/Public%20holidays%20-%202021.pdf",
                    "type": "pdf"
                }
            },
            {
                "2022": {
                    "url": "https://pmo.govmu.org/Communique/Public%20holidays%20-%202022.pdf",
                    "type": "pdf"
                }
            },
            {
                "2023": {
                    "url": "https://pmo.govmu.org/Communique/Public_Holidays_2023.pdf",
                    "type": "pdf"
                }
            },
            {
                "2024": {
                    "url": "https://pmo.govmu.org/Communique/Notice%20-%20Final%20Public%20holidays%20-%202024.pdf",
                    "type": "pdf"
                }
            },
            {
                "2025": {
                    "url": "https://pmo.govmu.org/Communique/Notice-Public_Holidays_2025.pdf",
                    "type": "pdf"
                }
            }
        ],
        "sa": [
            {
                "2024": {
                    "url": "https://www.gov.za/about-sa/public-holidays",
                    "type": "webpage",
                    "tag": "div",
                    "attr": "id",
                    "value": "block-eco-omega-system-main"
                }
            },
            {
                "2025": {
                    "url": "https://www.gov.za/about-sa/public-holidays",
                    "type": "webpage",
                    "tag": "div",
                    "attr": "id",
                    "value": "block-eco-omega-system-main"
                }
            },
            {
                "2026": {
                    "url": "https://www.gov.za/about-sa/public-holidays",
                    "type": "webpage",
                    "tag": "div",
                    "attr": "id",
                    "value": "block-eco-omega-system-main"
                }
            }
        ]
    }
}
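Each year key maps to an object whose type field routes it through process_link. A minimal schema sanity check (illustrative, not part of the commit):

import json

with open("links.json") as f:
    cfg = json.load(f)

for country, entries in cfg["countries"].items():
    for entry in entries:
        for year, info in entry.items():
            assert info["type"] in ("pdf", "webpage"), (country, year)
            assert info["url"].startswith("https://"), (country, year)
            if info["type"] == "webpage":
                # webpage entries need the selector triple used by extract_relevant_text
                assert {"tag", "attr", "value"} <= info.keys(), (country, year)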
