Adding web scraper to NER
ptorru committed Aug 28, 2024
1 parent a1af100 commit 2b69270
Showing 8 changed files with 2,769 additions and 20 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
**env.sh
15 changes: 14 additions & 1 deletion README.md
@@ -1,2 +1,15 @@
# octoai-solutions
# OctoAI Solutions
A collection of reference solutions built on top of OctoAI SaaS

## Project setup
Some of the solutions are implemented as Poetry projects.

### Creating requirements.txt from poetry
```bash
poetry export --without-hashes --format=requirements.txt > requirements.txt
```

### Syncing requirements.txt to poetry
```bash
cat requirements.txt | xargs poetry add
```
17 changes: 17 additions & 0 deletions ner/README.md
@@ -0,0 +1,17 @@
# NER Playground

## Running locally
First add your API keys to `env.sh` (copy `env.sh.example`, included in this commit, as a starting point). Then:
```bash
source env.sh
```

### With Poetry
First set up the dependencies via:
```bash
poetry install --no-root
```
Now run via:
```bash
poetry run streamlit run ner_solution.py
```
3 changes: 3 additions & 0 deletions ner/env.sh.example
@@ -0,0 +1,3 @@
export LLAMA_CLOUD_API_KEY=
export OCTOAI_API_KEY=
export FIRECRAWL_API_KEY=
89 changes: 74 additions & 15 deletions ner/ner_solution.py
@@ -6,36 +6,51 @@

from code_editor import code_editor
from llama_parse import LlamaParse
from firecrawl import FirecrawlApp
from openai import OpenAI
from pathlib import Path


def convert_to_json_schema(yaml_str):
# Process yaml_dict
yaml_dict = yaml.load(yaml_str, Loader=yaml.SafeLoader)

# Prepare the return string
ret_str = "{\"properties\": {"
ret_str = '{"properties": {'
for name, value in yaml_dict.items():
description = ""
if "desc" in value:
description = value["desc"]
ret_str += "\"{}\": {{".format(name)
ret_str += "\"description\": \"{}\", ".format(description)
ret_str += "\"title\": \"{}\", ".format(name.replace("_", " ").title())
ret_str += "\"type\": \"string\"}, "
ret_str += '"{}": {{'.format(name)
ret_str += '"description": "{}", '.format(description)
ret_str += '"title": "{}", '.format(name.replace("_", " ").title())
ret_str += '"type": "string"}, '
ret_str = ret_str[:-2]
ret_str += "}, \"required\": ["
ret_str += '}, "required": ['
for name, value in yaml_dict.items():
ret_str += "\"{}\", ".format(name)
ret_str += '"{}", '.format(name)
ret_str = ret_str[:-2]
ret_str += "], \"title\": \"JSONObject\", \"type\": \"object\"}"
ret_str += '], "title": "JSONObject", "type": "object"}'

return ret_str
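
# Worked example (an illustrative sketch, not part of this commit): a
# hypothetical two-field entity spec and the schema string the function
# above would produce for it. Field names here are made up.
#
#   sample_yaml = """
#   company_name:
#     desc: Name of the company
#   ticker:
#     desc: Stock ticker symbol
#   """
#   convert_to_json_schema(sample_yaml)
#   # -> '{"properties": {"company_name": {"description": "Name of the company",
#   #     "title": "Company Name", "type": "string"}, "ticker": {"description":
#   #     "Stock ticker symbol", "title": "Ticker", "type": "string"}},
#   #     "required": ["company_name", "ticker"], "title": "JSONObject",
#   #     "type": "object"}'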


st.set_page_config(layout="wide", page_title="NER Solution")
st.write("## NER Solution")
st.write("## NER Playground")

if "octoai_api_key" not in st.session_state:
st.session_state["octoai_api_key"] = os.environ.get("OCTOAI_API_KEY", "")

octoai_api_key = st.sidebar.text_input(
"OctoAI API Token [(get yours here)](https://octoai.cloud/n)",
type="password",
value=st.session_state.octoai_api_key,
)


#################################################
# Section 1: Inputs

octoai_api_key = st.sidebar.text_input("OctoAI API Token [(get yours here)](https://octoai.cloud/n)", type="password")

pdf_file = st.sidebar.file_uploader("Upload your PDF file here", type=".pdf")

@@ -59,26 +74,70 @@ def convert_to_json_schema(yaml_str):
parser = LlamaParse(
# Get API key from https://github.com/run-llama/llama_parse
api_key=os.environ["LLAMA_CLOUD_API_KEY"],
result_type="markdown"
result_type="markdown",
)

website_url = st.sidebar.text_input("Enter the URL of the website to scrape")

web_parser = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])

#################################################
# Section 2: Processing the inputs

if pdf_file and octoai_api_key:
# Preprocess PDF
with st.status("Processing the PDFs into Markdown form..."):
# Store to disk
# FIXME - tmoreau: let's not do this in the final version
fp = Path("./", pdf_file.name)
with open(fp, mode='wb') as w:
with open(fp, mode="wb") as w:
w.write(pdf_file.read())
# Read in first document
documents = parser.load_data(Path("./", pdf_file.name))
doc_str = ""
for document in documents:
doc_str += document.text
doc_str += "\n"
st.session_state.doc_str = doc_str

elif website_url:
with st.status("Processing the website into Markdown form..."):
# Crawl a website:
crawl_status = web_parser.crawl_url(
website_url,
params={
"limit": 5,
"scrapeOptions": {"formats": ["markdown"]},
"excludePaths": ["/blog", "/docs"],
},
wait_until_done=True,
poll_interval=20,
)
doc_str = ""
for page in crawl_status["data"]:
doc_str += f"# {page['metadata']['title']}\n"
doc_str += page["markdown"]
doc_str += "\n"

st.session_state.doc_str = doc_str
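
# Note (an assumption about the Firecrawl SDK response shape, not shown in
# this commit): the loop above expects crawl_status to look roughly like
#   {"data": [{"metadata": {"title": "..."}, "markdown": "..."}, ...]}
# with one entry per crawled page.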


#################################################
# Section 3: Processing the outputs

if "doc_str" in st.session_state.keys() and st.session_state.doc_str != "":
with st.expander(
f"See extracted markdown: {st.session_state.doc_str[:50]}", expanded=False
):
tab1, tab2 = st.tabs(["Markdown", "Raw"])

with tab1:
st.markdown(st.session_state.doc_str)
with tab2:
st.code(st.session_state.doc_str, language="markdown")

# Prepare the JSON schema
json_schema = convert_to_json_schema(code_response['text'])
json_schema = convert_to_json_schema(code_response["text"])

# Let's do some LLM magic here
with st.status("Converting to JSON form..."):
@@ -98,10 +157,10 @@ def convert_to_json_schema(yaml_str):
"model": "meta-llama-3.1-70b-instruct",
"messages": [
{"role": "system", "content": system_prompt.format(json_schema)},
{"role": "user", "content": doc_str}
{"role": "user", "content": doc_str},
],
"temperature": 0,
"max_tokens": 131072
"max_tokens": 131072,
}
# Derive output values
response = client.chat.completions.create(**data)
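# Sketch (assumes the standard OpenAI chat-completions response shape that
# the `openai` client above implies): the extracted entities come back as a
# JSON string in the first choice and can be parsed for display, e.g.
#   import json
#   entities = json.loads(response.choices[0].message.content)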
