feat: added support for webpage links
added support for scraping webpage links and parsing data using OpenAI
1 parent 7b32e5c · commit b0a86d9
Showing 7 changed files with 408 additions and 77 deletions.
.gitignore

@@ -1,3 +1,4 @@
 .vscode
 .DS_Store
 path/
+test.py
@@ -1,76 +1,182 @@
 import os
 import io
 import json
 import pathlib
+import sys
 from datetime import datetime
 from concurrent.futures import ThreadPoolExecutor
 
 import warnings
 
 warnings.simplefilter(action="ignore", category=FutureWarning)
 
+import requests
 import pypdf
 import pandas as pd
-import requests
 import tabula
+import openai
+from bs4 import BeautifulSoup
 
-# Read the links from the JSON file using a context manager
+# Load links from a JSON file
 with open("links.json") as f:
     links_data = json.load(f)
-    links = links_data["links"]
 
 # Initialize the result dictionary
-result_by_year = {}
-
-
-def process_link(link):
-    for year, url in link.items():
+result_by_country = {}
+
+
+# Fetch webpage content
+def fetch_webpage_content(url):
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        return response.text
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching the webpage: {e}")
+        return None
+
+
+def extract_relevant_text(html_content, html_selector, html_attribute, html_value):
+    """
+    Parses the HTML content and extracts the relevant text containing public holidays.
+    Adjust the parsing logic based on the webpage structure.
+    """
+    soup = BeautifulSoup(html_content, "html.parser")
+
+    # Example: Assuming public holidays are listed within a specific table or div.
+    # Inspect the actual webpage to identify the correct tags and classes/ids.
+    holidays_section = soup.find(
+        html_selector, {html_attribute: html_value}
+    )  # Modify as needed
+
+    if not holidays_section:
+        print("Could not find the holidays section in the webpage.")
+        sys.exit(1)
+
+    text_content = holidays_section.get_text(separator="\n")
+    print("Successfully extracted relevant text from the webpage.")
+    return text_content
+
+
+# Extract holidays using OpenAI
+def extract_holidays_with_openai(content, country, year):
+    openai.api_key = os.getenv("SPECIAL_OPENAI_KEY")
+    if not openai.api_key:
+        print("Error: SPECIAL_OPENAI_KEY environment variable not set.")
+        sys.exit(1)
+
+    prompt = f"""
+    Extract the public holidays for {country} in the year {year} from the following text.
+    Provide the data in JSON format with the structure:
+    {{
+        "{year}": [
+            {{
+                "name": "Holiday Name",
+                "date": "YYYY-MM-DD",
+            }},
+            ...
+        ]
+    }}
+    Here is the text:
+    {content}
+    """
+
+    try:
+        response = openai.ChatCompletion.create(
+            model="gpt-4o-mini",
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a helpful assistant that extracts public holidays from text.",
+                },
+                {"role": "user", "content": prompt},
+            ],
+            max_tokens=2000,
+            temperature=0,
+        )
+
+        return response["choices"][0]["message"]["content"]
+    except openai.error.OpenAIError as e:
+        print(f"OpenAI API error: {e}")
+        return None
+
+
+# Validate and load JSON
+def validate_and_load_json(json_str):
+    try:
+        # Trim any prose the model may have wrapped around the JSON object
+        json_start = json_str.find("{")
+        json_end = json_str.rfind("}") + 1
+        json_clean = json_str[json_start:json_end]
+        return json.loads(json_clean)
+    except json.JSONDecodeError as e:
+        print(f"Error decoding JSON: {e}")
+        return None
+
+
+# Process each link (PDF or webpage)
+def process_link(country, link):
+    for year, link_info in link.items():
+        url = link_info["url"]
+        link_type = link_info["type"]
         try:
-            # Read the PDF file from the URL using requests library
-            response = requests.get(url)
-            response.raise_for_status()
-            pdf_file = io.BytesIO(response.content)
-
-            # Create a pypdf PdfFileReader object from the PDF file
-            pdf_reader = pypdf.PdfReader(pdf_file)
-
-            # Extract the table data from the first page of the PDF using tabula-py
-            tables = tabula.read_pdf(url, pages=1, pandas_options={"header": None})
-
-            # Convert the table data to a pandas DataFrame
-            df = pd.DataFrame(tables[0])
-
-            # Convert the DataFrame to a list of dictionaries
-            table_data = df.to_dict(orient="records")
-
-            # Create a dictionary with the key-value pairs
-            result = []
-            for row in table_data:
-                row[1] += f" {year}"
-                result.append(
-                    {
-                        "name": row[0].title(),
-                        "date": datetime.strptime(row[1], "%A %d %B %Y").strftime(
-                            "%Y-%m-%d"
-                        ),
-                    }
-                )
-
-            # Group the result by year
-            if year not in result_by_year:
-                result_by_year[year] = []
-            result_by_year[year].extend(result)
-        except requests.RequestException as e:
-            print(f"Failed to fetch {url}: {e}")
+            if link_type == "pdf":
+                response = requests.get(url)
+                response.raise_for_status()
+                pdf_file = io.BytesIO(response.content)
+                pdf_reader = pypdf.PdfReader(pdf_file)
+                tables = tabula.read_pdf(url, pages=1, pandas_options={"header": None})
+                df = pd.DataFrame(tables[0])
+                table_data = df.to_dict(orient="records")
+
+                result = []
+                for row in table_data:
+                    row[1] += f" {year}"
+                    result.append(
+                        {
+                            "name": row[0].title(),
+                            "date": datetime.strptime(row[1], "%A %d %B %Y").strftime(
+                                "%Y-%m-%d"
+                            ),
+                        }
+                    )
+
+                if country not in result_by_country:
+                    result_by_country[country] = {}
+                if year not in result_by_country[country]:
+                    result_by_country[country][year] = []
+                result_by_country[country][year].extend(result)
+
+            elif link_type == "webpage":
+                content = fetch_webpage_content(url)
+                if content:
+                    relevant_text = extract_relevant_text(
+                        content, link_info["tag"], link_info["attr"], link_info["value"]
+                    )
+                    if relevant_text:
+                        extracted_json_str = extract_holidays_with_openai(
+                            relevant_text, country, year
+                        )
+                        data = validate_and_load_json(extracted_json_str)
+                        if data:
+                            if country not in result_by_country:
+                                result_by_country[country] = {}
+                            if year not in result_by_country[country]:
+                                result_by_country[country][year] = []
+                            result_by_country[country][year].extend(data[year])
+
+        except Exception as e:
+            print(f"Error processing {url}: {e}")
 
 
-# Use ThreadPoolExecutor to process links concurrently
+# Process links concurrently
 with ThreadPoolExecutor() as executor:
-    executor.map(process_link, links)
+    for country, links in links_data["countries"].items():
+        # Bind country as a default argument so each task keeps its own value
+        executor.map(lambda link, country=country: process_link(country, link), links)
 
-# Sort the result by year
-sorted_result_by_year = dict(sorted(result_by_year.items()))
-
-# Create the data folder if it does not exist using pathlib library
+# Create the data folder and save output
 pathlib.Path("data").mkdir(parents=True, exist_ok=True)
 
-# Write the output to a JSON file in the data folder using pathlib library
-with open(pathlib.Path("data") / "public-holidays.json", "w") as outfile:
-    json.dump(sorted_result_by_year, outfile, indent=4)
+for country, data in result_by_country.items():
+    sorted_result_by_year = dict(sorted(data.items()))
+    with open(pathlib.Path("data") / f"public-holidays-{country}.json", "w") as outfile:
+        json.dump(sorted_result_by_year, outfile, indent=4)
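With this change the script writes one file per country under data/. A sketch of the expected output shape, assuming the model returns exactly the structure requested in the prompt (the holiday name below is a placeholder, not extracted data):

    data/public-holidays-sa.json
    {
        "2024": [
            {
                "name": "Holiday Name",
                "date": "2024-01-01"
            }
        ]
    }

The PDF branch keeps the original tabula-based parsing for Mauritius; the new webpage branch trims the page down to a single HTML section before sending it to the model, which keeps the prompt small, and runs at temperature=0 to keep the output stable.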
links.json

@@ -1,20 +1,71 @@
 {
-  "links": [
-    {
-      "2020": "https://pmo.govmu.org/Communique/Public%20holidays%20--%202020.pdf"
-    },
-    {
-      "2021": "https://pmo.govmu.org/Communique/Public%20holidays%20-%202021.pdf"
-    },
-    {
-      "2022": "https://pmo.govmu.org/Communique/Public%20holidays%20-%202022.pdf"
-    },
-    { "2023": "https://pmo.govmu.org/Communique/Public_Holidays_2023.pdf" },
-    {
-      "2024": "https://pmo.govmu.org/Communique/Notice%20-%20Final%20Public%20holidays%20-%202024.pdf"
-    },
-    {
-      "2025": "https://pmo.govmu.org/Communique/Notice-Public_Holidays_2025.pdf"
-    }
-  ]
+  "countries": {
+    "mu": [
+      {
+        "2020": {
+          "url": "https://pmo.govmu.org/Communique/Public%20holidays%20--%202020.pdf",
+          "type": "pdf"
+        }
+      },
+      {
+        "2021": {
+          "url": "https://pmo.govmu.org/Communique/Public%20holidays%20-%202021.pdf",
+          "type": "pdf"
+        }
+      },
+      {
+        "2022": {
+          "url": "https://pmo.govmu.org/Communique/Public%20holidays%20-%202022.pdf",
+          "type": "pdf"
+        }
+      },
+      {
+        "2023": {
+          "url": "https://pmo.govmu.org/Communique/Public_Holidays_2023.pdf",
+          "type": "pdf"
+        }
+      },
+      {
+        "2024": {
+          "url": "https://pmo.govmu.org/Communique/Notice%20-%20Final%20Public%20holidays%20-%202024.pdf",
+          "type": "pdf"
+        }
+      },
+      {
+        "2025": {
+          "url": "https://pmo.govmu.org/Communique/Notice-Public_Holidays_2025.pdf",
+          "type": "pdf"
+        }
+      }
+    ],
+    "sa": [
+      {
+        "2024": {
+          "url": "https://www.gov.za/about-sa/public-holidays",
+          "type": "webpage",
+          "tag": "div",
+          "attr": "id",
+          "value": "block-eco-omega-system-main"
+        }
+      },
+      {
+        "2025": {
+          "url": "https://www.gov.za/about-sa/public-holidays",
+          "type": "webpage",
+          "tag": "div",
+          "attr": "id",
+          "value": "block-eco-omega-system-main"
+        }
+      },
+      {
+        "2026": {
+          "url": "https://www.gov.za/about-sa/public-holidays",
+          "type": "webpage",
+          "tag": "div",
+          "attr": "id",
+          "value": "block-eco-omega-system-main"
+        }
+      }
+    ]
+  }
 }
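Adding another source is just a new entry in the same shape. A hypothetical example (the country code, URL, and selector values below are placeholders; the tag/attr/value triple must come from inspecting the real page, as the script's comments note):

    "xx": [
      {
        "2026": {
          "url": "https://example.org/public-holidays",
          "type": "webpage",
          "tag": "div",
          "attr": "id",
          "value": "holidays-section"
        }
      }
    ]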