-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
20516de
commit ab1ebc0
Showing
3 changed files
with
198 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
|
||
.env |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
{ | ||
"[python]": { | ||
"editor.defaultFormatter": "ms-python.autopep8" | ||
}, | ||
"python.formatting.provider": "none" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,190 @@ | ||
from bs4 import BeautifulSoup | ||
from urllib.parse import urljoin, urlparse | ||
from dotenv import load_dotenv | ||
import requests | ||
import json | ||
import os | ||
import html2text | ||
from langchain.chat_models import ChatOpenAI | ||
from llama_index import Document | ||
from llama_index.node_parser import SimpleNodeParser | ||
from llama_index.text_splitter import TokenTextSplitter | ||
from langchain.prompts import ChatPromptTemplate | ||
from llama_index import VectorStoreIndex | ||
import openai | ||
|
||
# Pull settings from a local .env file into the process environment before
# any credentials are read.
load_dotenv()

# NOTE: "brwoserless" is a misspelling of "browserless"; the name is kept
# as-is because scrape_website() reads this module-level variable.
brwoserless_api_key = os.environ.get("BROWSERLESS_API_KEY")
openai.api_key = os.environ.get("OPENAI_API_KEY")
|
||
|
||
# 1. Scrape raw HTML | ||
|
||
def scrape_website(url: str, timeout: float = 60.0):
    """Fetch the ``<body>`` HTML of *url* via the Browserless ``/scrape`` API.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the Browserless service before giving
            up. Without a timeout ``requests`` waits indefinitely, so a
            stalled connection would hang the whole script.

    Returns:
        The HTML string found at ``data[0].results[0].html`` in the JSON
        response, or ``None`` when the service answers with a status code
        other than 200 (a message is printed in that case).

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            *timeout* expires.
        KeyError, IndexError: If a 200 response lacks the expected
            ``data``/``results``/``html`` structure.
    """
    print("Scraping website...")

    headers = {
        'Cache-Control': 'no-cache',
        'Content-Type': 'application/json',
    }

    # Request a single element: the whole <body> of the page.
    payload = json.dumps({
        "url": url,
        "elements": [{
            "selector": "body"
        }]
    })

    response = requests.post(
        f"https://chrome.browserless.io/scrape?token={brwoserless_api_key}",
        headers=headers,
        data=payload,
        timeout=timeout,
    )

    if response.status_code != 200:
        print(f"HTTP request failed with status code {response.status_code}")
        return None

    # The body is JSON; decode it and dig out the HTML of the first match
    # for the first (and only) requested selector.
    data_dict = json.loads(response.content.decode('utf-8'))
    return data_dict['data'][0]['results'][0]['html']
|
||
|
||
# 2. Convert html to markdown | ||
|
||
def convert_html_to_markdown(html):
    """Render an HTML string as Markdown using ``html2text``.

    Args:
        html: The HTML markup to convert.

    Returns:
        The Markdown text produced by ``html2text.HTML2Text.handle``.
    """
    md_converter = html2text.HTML2Text()
    # Explicitly leave link handling switched on (ignore_links off).
    md_converter.ignore_links = False
    return md_converter.handle(html)
|
||
|
||
# Reduce a page URL to its origin, e.g. https://developers.webflow.com/docs/getting-started-with-apps -> https://developers.webflow.com
|
||
def get_base_url(url: str) -> str:
    """Return the ``scheme://netloc`` origin of an absolute URL.

    Path, query string and fragment are dropped; a port, if present, is
    kept because it is part of the network location.

    Args:
        url: An absolute URL such as ``https://example.com/docs/page``.

    Returns:
        The scheme and network location joined as ``scheme://netloc``.

    Raises:
        ValueError: If *url* has no scheme or no network location. Such
            input used to yield the meaningless string ``"://"``, which
            then produced broken links further down the pipeline.
    """
    parsed_url = urlparse(url)
    if not parsed_url.scheme or not parsed_url.netloc:
        raise ValueError(
            f"Expected an absolute URL with scheme and host, got {url!r}")
    return f"{parsed_url.scheme}://{parsed_url.netloc}"
|
||
|
||
# Rewrite relative URLs in the HTML as absolute URLs
|
||
def convert_to_absolute_url(html, base_url):
    """Rewrite relative image and link URLs in *html* as absolute URLs.

    For every ``<img>`` the ``src`` attribute is rewritten when it is
    present and non-empty; otherwise ``data-src`` is rewritten when that is
    present and non-empty. For every ``<a>`` the ``href`` attribute is
    rewritten. Values that already start with ``http://`` or ``https://``
    are left untouched.

    Args:
        html: HTML markup to process.
        base_url: URL that relative references are joined onto with
            ``urllib.parse.urljoin``.

    Returns:
        The modified document serialised back to a string.
    """
    soup = BeautifulSoup(html, 'html.parser')

    for img_tag in soup.find_all('img'):
        if img_tag.get('src'):
            _absolutize_attribute(img_tag, 'src', base_url)
        elif img_tag.get('data-src'):
            _absolutize_attribute(img_tag, 'data-src', base_url)

    for link_tag in soup.find_all('a'):
        # Anchors without an href (e.g. <a name="...">) have nothing to
        # rewrite; calling .startswith on the missing value used to raise
        # AttributeError.
        if link_tag.get('href') is None:
            continue
        _absolutize_attribute(link_tag, 'href', base_url)

    return str(soup)


def _absolutize_attribute(tag, attribute, base_url):
    """Join ``tag[attribute]`` onto *base_url* unless it is already http(s)."""
    value = tag[attribute]
    if value.startswith(('http://', 'https://')):
        return
    tag[attribute] = urljoin(base_url, value)
|
||
|
||
def get_markdown_from_url(url):
    """Scrape *url* and return its content as Markdown.

    The page HTML is fetched with ``scrape_website``, its relative image
    and link URLs are made absolute, and the result is converted to
    Markdown.

    NOTE: relative URLs are joined onto the site origin returned by
    ``get_base_url`` (scheme and host only), not onto the page's own path.

    Args:
        url: Address of the page to scrape.

    Returns:
        The Markdown rendering of the page.

    Raises:
        RuntimeError: If scraping produced no HTML. ``scrape_website``
            returns ``None`` on a non-200 response, which previously
            surfaced as an unrelated error inside the HTML parser.
    """
    base_url = get_base_url(url)
    html = scrape_website(url)
    if html is None:
        raise RuntimeError(f"Could not scrape {url}: no HTML was returned")

    updated_html = convert_to_absolute_url(html, base_url)
    return convert_html_to_markdown(updated_html)
|
||
|
||
# 3. Create vector index from markdown | ||
|
||
def create_index_from_text(markdown):
    """Split *markdown* into nodes and build a ``VectorStoreIndex`` from them.

    Args:
        markdown: The Markdown text to index.

    Returns:
        A ``VectorStoreIndex`` built over the nodes parsed from the text.
    """
    # Token-based chunks of 1024 with an overlap of 20, split on newlines
    # with the listed fallback separators.
    splitter = TokenTextSplitter(
        separator="\n",
        chunk_size=1024,
        chunk_overlap=20,
        backup_separators=["\n\n", ".", ","],
    )
    document = Document(text=markdown)
    parser = SimpleNodeParser(text_splitter=splitter)
    nodes = parser.get_nodes_from_documents([document], show_progress=True)

    vector_index = VectorStoreIndex(nodes)

    print("Index created!")
    return vector_index
|
||
|
||
# 4. Retrieval Augmented Generation (RAG) | ||
|
||
|
||
def generate_answer(query, index, model_name="gpt-3.5-turbo-16k-0613"):
    """Answer *query* using chunks retrieved from *index* as context.

    Args:
        query: The user's question.
        index: An index object exposing ``as_retriever()``; the retriever's
            results are read through ``node.node.text``.
        model_name: Chat model passed to ``ChatOpenAI``. Defaults to the
            model the script originally hard-coded.

    Returns:
        The ``content`` of the chat model's response.
    """
    # Get relevant data with similarity search
    retriever = index.as_retriever()
    nodes = retriever.retrieve(query)
    texts = [node.node.text for node in nodes]

    print("Retrieved texts!", texts)

    # Generate answer with OpenAI
    model = ChatOpenAI(model_name=model_name)
    template = """
CONTEXT: {docs}
You are a helpful assistant, above is some context,
Please answer the question, and make sure you follow ALL of the rules below:
1. Answer the questions only based on context provided, do not make things up
2. Answer questions in a helpful manner that straight to the point, with clear structure & all relevant information that might help users answer the question
3. Anwser should be formatted in Markdown
4. If there are relevant images, video, links, they are very important reference data, please include them as part of the answer
QUESTION: {query}
ANSWER (formatted in Markdown):
"""
    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | model

    # Join the chunks into plain text. Passing the list itself would be
    # formatted into the prompt as a Python list repr (brackets, quotes,
    # escaped newlines), which garbles the Markdown context.
    context = "\n\n".join(texts)
    response = chain.invoke({"docs": context, "query": query})

    return response.content
|
||
|
||
# Example run: scrape one documentation page, index it, and answer a
# question about it.
url = "https://developers.webflow.com/docs/getting-started-with-apps"
query = "How to create a Webflow app?"

# Scrape -> Markdown -> vector index -> retrieval-augmented answer.
markdown = get_markdown_from_url(url)
index = create_index_from_text(markdown)
answer = generate_answer(query, index)

print(answer)