feat: added support for webpage links
added support for scraping webpage links and parsing data using OpenAI
1 parent 7b32e5c · commit b0a86d9
Showing 7 changed files with 408 additions and 77 deletions.
.gitignore

@@ -1,3 +1,4 @@
 .vscode
 .DS_Store
 path/
+test.py
@@ -1,76 +1,182 @@
 import os
 import io
 import json
 import pathlib
+import sys
 from datetime import datetime
 from concurrent.futures import ThreadPoolExecutor
 
 import warnings
 
 warnings.simplefilter(action="ignore", category=FutureWarning)
 
+import requests
 import pypdf
 import pandas as pd
-import requests
 import tabula
+import openai
+from bs4 import BeautifulSoup
 
-# Read the links from the JSON file using a context manager
+# Load links from a JSON file
 with open("links.json") as f:
     links_data = json.load(f)
-    links = links_data["links"]
 
 # Initialize the result dictionary
-result_by_year = {}
-
-
-def process_link(link):
-    for year, url in link.items():
+result_by_country = {}
+
+
+# Fetch webpage content
+def fetch_webpage_content(url):
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        return response.text
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching the webpage: {e}")
+        return None
+
+
+def extract_relevant_text(html_content, html_selector, html_attribute, html_value):
+    """
+    Parses the HTML content and extracts the relevant text containing public holidays.
+    Adjust the parsing logic based on the webpage structure.
+    """
+    soup = BeautifulSoup(html_content, "html.parser")
+
+    # Example: Assuming public holidays are listed within a specific table or div.
+    # Inspect the actual webpage to identify the correct tags and classes/ids.
+    holidays_section = soup.find(
+        html_selector, {html_attribute: html_value}
+    )  # Modify as needed
+
+    if not holidays_section:
+        print("Could not find the holidays section in the webpage.")
+        sys.exit(1)
+
+    text_content = holidays_section.get_text(separator="\n")
+    print("Successfully extracted relevant text from the webpage.")
+    return text_content
+
+
+# Extract holidays using OpenAI
+def extract_holidays_with_openai(content, country, year):
+    openai.api_key = os.getenv("SPECIAL_OPENAI_KEY")
+    if not openai.api_key:
+        print("Error: SPECIAL_OPENAI_KEY environment variable not set.")
+        sys.exit(1)
+
+    prompt = f"""
+    Extract the public holidays for {country} in the year {year} from the following text.
+    Provide the data in JSON format with the structure:
+    {{
+        "{year}": [
+            {{
+                "name": "Holiday Name",
+                "date": "YYYY-MM-DD",
+            }},
+            ...
+        ]
+    }}
+    Here is the text:
+    {content}
+    """
+
+    try:
+        response = openai.ChatCompletion.create(
+            model="gpt-4o-mini",
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a helpful assistant that extracts public holidays from text.",
+                },
+                {"role": "user", "content": prompt},
+            ],
+            max_tokens=2000,
+            temperature=0,
+        )
+
+        return response["choices"][0]["message"]["content"]
+    except openai.error.OpenAIError as e:
+        print(f"OpenAI API error: {e}")
+        return None
+
+
+# Validate and load JSON
+def validate_and_load_json(json_str):
+    try:
+        # Trim any prose the model may have wrapped around the JSON object
+        json_start = json_str.find("{")
+        json_end = json_str.rfind("}") + 1
+        json_clean = json_str[json_start:json_end]
+        return json.loads(json_clean)
+    except json.JSONDecodeError as e:
+        print(f"Error decoding JSON: {e}")
+        return None
+
+
+# Process each link (PDF or webpage)
+def process_link(country, link):
+    for year, link_info in link.items():
+        url = link_info["url"]
+        link_type = link_info["type"]
         try:
-            # Read the PDF file from the URL using requests library
-            response = requests.get(url)
-            response.raise_for_status()
-            pdf_file = io.BytesIO(response.content)
-
-            # Create a pypdf PdfFileReader object from the PDF file
-            pdf_reader = pypdf.PdfReader(pdf_file)
-
-            # Extract the table data from the first page of the PDF using tabula-py
-            tables = tabula.read_pdf(url, pages=1, pandas_options={"header": None})
-
-            # Convert the table data to a pandas DataFrame
-            df = pd.DataFrame(tables[0])
-
-            # Convert the DataFrame to a list of dictionaries
-            table_data = df.to_dict(orient="records")
-
-            # Create a dictionary with the key-value pairs
-            result = []
-            for row in table_data:
-                row[1] += f" {year}"
-                result.append(
-                    {
-                        "name": row[0].title(),
-                        "date": datetime.strptime(row[1], "%A %d %B %Y").strftime(
-                            "%Y-%m-%d"
-                        ),
-                    }
-                )
-
-            # Group the result by year
-            if year not in result_by_year:
-                result_by_year[year] = []
-            result_by_year[year].extend(result)
-        except requests.RequestException as e:
-            print(f"Failed to fetch {url}: {e}")
+            if link_type == "pdf":
+                response = requests.get(url)
+                response.raise_for_status()
+                pdf_file = io.BytesIO(response.content)
+                pdf_reader = pypdf.PdfReader(pdf_file)
+                tables = tabula.read_pdf(url, pages=1, pandas_options={"header": None})
+                df = pd.DataFrame(tables[0])
+                table_data = df.to_dict(orient="records")
+
+                result = []
+                for row in table_data:
+                    row[1] += f" {year}"
+                    result.append(
+                        {
+                            "name": row[0].title(),
+                            "date": datetime.strptime(row[1], "%A %d %B %Y").strftime(
+                                "%Y-%m-%d"
+                            ),
+                        }
+                    )
+
+                if country not in result_by_country:
+                    result_by_country[country] = {}
+                if year not in result_by_country[country]:
+                    result_by_country[country][year] = []
+                result_by_country[country][year].extend(result)
+
+            elif link_type == "webpage":
+                content = fetch_webpage_content(url)
+                if content:
+                    relevant_text = extract_relevant_text(
+                        content, link_info["tag"], link_info["attr"], link_info["value"]
+                    )
+                    if relevant_text:
+                        extracted_json_str = extract_holidays_with_openai(
+                            relevant_text, country, year
+                        )
+                        data = validate_and_load_json(extracted_json_str)
+                        if data:
+                            if country not in result_by_country:
+                                result_by_country[country] = {}
+                            if year not in result_by_country[country]:
+                                result_by_country[country][year] = []
+                            result_by_country[country][year].extend(data[year])
+
+        except Exception as e:
+            print(f"Error processing {url}: {e}")
 
 
-# Use ThreadPoolExecutor to process links concurrently
+# Process links concurrently
 with ThreadPoolExecutor() as executor:
-    executor.map(process_link, links)
+    for country, links in links_data["countries"].items():
+        # Bind country as a default argument so each task keeps its own value
+        executor.map(lambda link, country=country: process_link(country, link), links)
 
-# Sort the result by year
-sorted_result_by_year = dict(sorted(result_by_year.items()))
-
-# Create the data folder if it does not exist using pathlib library
+# Create the data folder and save output
 pathlib.Path("data").mkdir(parents=True, exist_ok=True)
 
-# Write the output to a JSON file in the data folder using pathlib library
-with open(pathlib.Path("data") / "public-holidays.json", "w") as outfile:
-    json.dump(sorted_result_by_year, outfile, indent=4)
+for country, data in result_by_country.items():
+    sorted_result_by_year = dict(sorted(data.items()))
+    with open(pathlib.Path("data") / f"public-holidays-{country}.json", "w") as outfile:
+        json.dump(sorted_result_by_year, outfile, indent=4)
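With this change the script writes one file per country under data/. A sketch of the expected output shape, assuming the model returns exactly the structure requested in the prompt (the holiday name below is a placeholder, not extracted data):

    data/public-holidays-sa.json
    {
        "2024": [
            {
                "name": "Holiday Name",
                "date": "2024-01-01"
            }
        ]
    }

The PDF branch keeps the original tabula-based parsing for Mauritius; the new webpage branch trims the page down to a single HTML section before sending it to the model, which keeps the prompt small, and runs at temperature=0 to keep the output stable.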
links.json

@@ -1,20 +1,71 @@
 {
-  "links": [
-    {
-      "2020": "https://pmo.govmu.org/Communique/Public%20holidays%20--%202020.pdf"
-    },
-    {
-      "2021": "https://pmo.govmu.org/Communique/Public%20holidays%20-%202021.pdf"
-    },
-    {
-      "2022": "https://pmo.govmu.org/Communique/Public%20holidays%20-%202022.pdf"
-    },
-    { "2023": "https://pmo.govmu.org/Communique/Public_Holidays_2023.pdf" },
-    {
-      "2024": "https://pmo.govmu.org/Communique/Notice%20-%20Final%20Public%20holidays%20-%202024.pdf"
-    },
-    {
-      "2025": "https://pmo.govmu.org/Communique/Notice-Public_Holidays_2025.pdf"
-    }
-  ]
+  "countries": {
+    "mu": [
+      {
+        "2020": {
+          "url": "https://pmo.govmu.org/Communique/Public%20holidays%20--%202020.pdf",
+          "type": "pdf"
+        }
+      },
+      {
+        "2021": {
+          "url": "https://pmo.govmu.org/Communique/Public%20holidays%20-%202021.pdf",
+          "type": "pdf"
+        }
+      },
+      {
+        "2022": {
+          "url": "https://pmo.govmu.org/Communique/Public%20holidays%20-%202022.pdf",
+          "type": "pdf"
+        }
+      },
+      {
+        "2023": {
+          "url": "https://pmo.govmu.org/Communique/Public_Holidays_2023.pdf",
+          "type": "pdf"
+        }
+      },
+      {
+        "2024": {
+          "url": "https://pmo.govmu.org/Communique/Notice%20-%20Final%20Public%20holidays%20-%202024.pdf",
+          "type": "pdf"
+        }
+      },
+      {
+        "2025": {
+          "url": "https://pmo.govmu.org/Communique/Notice-Public_Holidays_2025.pdf",
+          "type": "pdf"
+        }
+      }
+    ],
+    "sa": [
+      {
+        "2024": {
+          "url": "https://www.gov.za/about-sa/public-holidays",
+          "type": "webpage",
+          "tag": "div",
+          "attr": "id",
+          "value": "block-eco-omega-system-main"
+        }
+      },
+      {
+        "2025": {
+          "url": "https://www.gov.za/about-sa/public-holidays",
+          "type": "webpage",
+          "tag": "div",
+          "attr": "id",
+          "value": "block-eco-omega-system-main"
+        }
+      },
+      {
+        "2026": {
+          "url": "https://www.gov.za/about-sa/public-holidays",
+          "type": "webpage",
+          "tag": "div",
+          "attr": "id",
+          "value": "block-eco-omega-system-main"
+        }
+      }
+    ]
+  }
 }
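Adding another source is just a new entry in the same shape. A hypothetical example (the country code, URL, and selector values below are placeholders; the tag/attr/value triple must come from inspecting the real page, as the script's comments note):

    "xx": [
      {
        "2026": {
          "url": "https://example.org/public-holidays",
          "type": "webpage",
          "tag": "div",
          "attr": "id",
          "value": "holidays-section"
        }
      }
    ]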