Adding web scraper to NER
ptorru committed Aug 28, 2024
1 parent a1af100 commit 2b69270
Showing 8 changed files with 2,769 additions and 20 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
**env.sh
15 changes: 14 additions & 1 deletion README.md
@@ -1,2 +1,15 @@
# octoai-solutions
# OctoAI Solutions
A collection of reference solutions built on top of OctoAI SaaS

## Project setup
Some of the solutions are implemented as Poetry projects.

### Creating requirements.txt from poetry
```bash
poetry export --without-hashes --format=requirements.txt > requirements.txt
```

### Syncing requirements.txt to poetry
```bash
cat requirements.txt | xargs poetry add
```
17 changes: 17 additions & 0 deletions ner/README.md
@@ -0,0 +1,17 @@
# NER Playground

## Running locally
First add your API keys to `env.sh` (copy `env.sh.example`, included in this commit, as a starting point). Then:
```bash
source env.sh
```

### With Poetry
First set up the dependencies via:
```bash
poetry install --no-root
```
Now run via:
```bash
poetry run streamlit run ner_solution.py
```
3 changes: 3 additions & 0 deletions ner/env.sh.example
@@ -0,0 +1,3 @@
export LLAMA_CLOUD_API_KEY=
export OCTOAI_API_KEY=
export FIRECRAWL_API_KEY=
89 changes: 74 additions & 15 deletions ner/ner_solution.py
@@ -6,36 +6,51 @@

from code_editor import code_editor
from llama_parse import LlamaParse
from firecrawl import FirecrawlApp
from openai import OpenAI
from pathlib import Path


def convert_to_json_schema(yaml_str):
# Process yaml_dict
yaml_dict = yaml.load(yaml_str, Loader=yaml.SafeLoader)

# Prepare the return string
ret_str = "{\"properties\": {"
ret_str = '{"properties": {'
for name, value in yaml_dict.items():
description = ""
if "desc" in value:
description = value["desc"]
ret_str += "\"{}\": {{".format(name)
ret_str += "\"description\": \"{}\", ".format(description)
ret_str += "\"title\": \"{}\", ".format(name.replace("_", " ").title())
ret_str += "\"type\": \"string\"}, "
ret_str += '"{}": {{'.format(name)
ret_str += '"description": "{}", '.format(description)
ret_str += '"title": "{}", '.format(name.replace("_", " ").title())
ret_str += '"type": "string"}, '
ret_str = ret_str[:-2]
ret_str += "}, \"required\": ["
ret_str += '}, "required": ['
for name, value in yaml_dict.items():
ret_str += "\"{}\", ".format(name)
ret_str += '"{}", '.format(name)
ret_str = ret_str[:-2]
ret_str += "], \"title\": \"JSONObject\", \"type\": \"object\"}"
ret_str += '], "title": "JSONObject", "type": "object"}'

return ret_str
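
# Worked example (an illustrative sketch, not part of this commit): a
# hypothetical two-field entity spec and the schema string the function
# above would produce for it. Field names here are made up.
#
#   sample_yaml = """
#   company_name:
#     desc: Name of the company
#   ticker:
#     desc: Stock ticker symbol
#   """
#   convert_to_json_schema(sample_yaml)
#   # -> '{"properties": {"company_name": {"description": "Name of the company",
#   #     "title": "Company Name", "type": "string"}, "ticker": {"description":
#   #     "Stock ticker symbol", "title": "Ticker", "type": "string"}},
#   #     "required": ["company_name", "ticker"], "title": "JSONObject",
#   #     "type": "object"}'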


st.set_page_config(layout="wide", page_title="NER Solution")
st.write("## NER Solution")
st.write("## NER Playground")

if "octoai_api_key" not in st.session_state:
st.session_state["octoai_api_key"] = os.environ.get("OCTOAI_API_KEY", "")

octoai_api_key = st.sidebar.text_input(
"OctoAI API Token [(get yours here)](https://octoai.cloud/n)",
type="password",
value=st.session_state.octoai_api_key,
)


#################################################
# Section 1: Inputs

octoai_api_key = st.sidebar.text_input("OctoAI API Token [(get yours here)](https://octoai.cloud/n)", type="password")

pdf_file = st.sidebar.file_uploader("Upload your PDF file here", type=".pdf")

@@ -59,26 +74,70 @@ def convert_to_json_schema(yaml_str):
parser = LlamaParse(
# Get API key from https://github.com/run-llama/llama_parse
api_key=os.environ["LLAMA_CLOUD_API_KEY"],
result_type="markdown"
result_type="markdown",
)

website_url = st.sidebar.text_input("Enter the URL of the website to scrape")

web_parser = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])

#################################################
# Section 2: Processing the inputs

if pdf_file and octoai_api_key:
# Preprocess PDF
with st.status("Processing the PDFs into Markdown form..."):
# Store to disk
# FIXME - tmoreau: let's not do this in the final version
fp = Path("./", pdf_file.name)
with open(fp, mode='wb') as w:
with open(fp, mode="wb") as w:
w.write(pdf_file.read())
# Read in first document
documents = parser.load_data(Path("./", pdf_file.name))
doc_str = ""
for document in documents:
doc_str += document.text
doc_str += "\n"
st.session_state.doc_str = doc_str

elif website_url:
with st.status("Processing the website into Markdown form..."):
# Crawl a website:
crawl_status = web_parser.crawl_url(
website_url,
params={
"limit": 5,
"scrapeOptions": {"formats": ["markdown"]},
"excludePaths": ["/blog", "/docs"],
},
wait_until_done=True,
poll_interval=20,
)
doc_str = ""
for page in crawl_status["data"]:
doc_str += f"# {page['metadata']['title']}\n"
doc_str += page["markdown"]
doc_str += "\n"

st.session_state.doc_str = doc_str
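
# Note (an assumption about the Firecrawl SDK response shape, not shown in
# this commit): the loop above expects crawl_status to look roughly like
#   {"data": [{"metadata": {"title": "..."}, "markdown": "..."}, ...]}
# with one entry per crawled page.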


#################################################
# Section 3: Processing the outputs

if "doc_str" in st.session_state.keys() and st.session_state.doc_str != "":
with st.expander(
f"See extracted markdown: {st.session_state.doc_str[:50]}", expanded=False
):
tab1, tab2 = st.tabs(["Markdown", "Raw"])

with tab1:
st.markdown(st.session_state.doc_str)
with tab2:
st.code(st.session_state.doc_str, language="markdown")

# Prepare the JSON schema
json_schema = convert_to_json_schema(code_response['text'])
json_schema = convert_to_json_schema(code_response["text"])

# Let's do some LLM magic here
with st.status("Converting to JSON form..."):
@@ -98,10 +157,10 @@ def convert_to_json_schema(yaml_str):
"model": "meta-llama-3.1-70b-instruct",
"messages": [
{"role": "system", "content": system_prompt.format(json_schema)},
{"role": "user", "content": doc_str}
{"role": "user", "content": doc_str},
],
"temperature": 0,
"max_tokens": 131072
"max_tokens": 131072,
}
# Derive output values
response = client.chat.completions.create(**data)
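# Sketch (assumes the standard OpenAI chat-completions response shape that
# the `openai` client above implies): the extracted entities come back as a
# JSON string in the first choice and can be parsed for display, e.g.
#   import json
#   entities = json.loads(response.choices[0].message.content)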
