adding S3 connector + example of YAML files
tmoreau89 committed Sep 10, 2024
1 parent 57b7f3a commit cfd5b79
Showing 6 changed files with 168 additions and 45 deletions.
172 changes: 127 additions & 45 deletions ner/ner_solution.py
@@ -3,6 +3,7 @@
import os
import pandas as pd
import requests
import boto3

import streamlit as st
import tempfile
@@ -39,50 +40,42 @@ def convert_to_json_schema(yaml_str):
return ret_str


def transcribe_audio(file_path: str, octoai_token: str):
def transcribe_audio(encoded_audio: str, octoai_token: str):
"""
Takes the file path of an audio file and transcribes it to text.
Returns a string with the transcribed text.
"""
with open(file_path, "rb") as f:
encoded_audio = str(base64.b64encode(f.read()), "utf-8")
reply = requests.post(
"https://whisper2-or1pkb9b656p.octoai.run/predict",
headers={
"Content-Type": "application/json",
"Authorization": f"Bearer {octoai_token}",
},
json={"audio": encoded_audio},
timeout=300,
)
try:
transcript = reply.json()["transcription"]
except Exception as e:
print(e)
print(reply.text)
raise ValueError("The transcription could not be completed.")
reply = requests.post(
"https://whisper2-or1pkb9b656p.octoai.run/predict",
headers={
"Content-Type": "application/json",
"Authorization": f"Bearer {octoai_token}",
},
json={"audio": encoded_audio},
timeout=300,
)
try:
transcript = reply.json()["transcription"]
except Exception as e:
print(e)
print(reply.text)
raise ValueError("The transcription could not be completed.")

return transcript
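With this refactor, transcribe_audio no longer opens files itself: callers base64-encode the audio first, as the local-file and S3 branches later in this diff do. A minimal sketch of the new calling convention; the file name "interview.wav" is a placeholder, and the token is read from the same OCTOAI_API_KEY environment variable the app uses.

```python
import base64
import os

# Placeholder audio file; in the app the bytes come from an upload or S3.
octoai_token = os.environ.get("OCTOAI_API_KEY", "")

with open("interview.wav", "rb") as f:
    encoded_audio = base64.b64encode(f.read()).decode("utf-8")

transcript = transcribe_audio(encoded_audio, octoai_token)
```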


def file_to_base64(file_path):
with open(file_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")


def process_image(file_path: str, octoai_token: str):
# Convert the images to base64 strings
base64_str = f"data:image/png;base64,{file_to_base64(file_path)}"
def process_image(encoded_image: str, octoai_token: str, yaml: str):
print(yaml)
messages = [
{
"role": "user",
"content": [
{
"type": "text",
"text": "Describe what you see in the image in great detail",
"text": "Describe what you see in the image in great detail. Be as exhaustive and factual as possible. Provide detail according to the JSON description below:\n{}".format(yaml),
},
{"type": "image_url", "image_url": {"url": base64_str}},
{"type": "image_url", "image_url": {"url": encoded_image}},
],
}
]
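process_image now expects a ready-made data URL plus a string rendering of the YAML field spec, matching the callers further down in this diff. A minimal sketch of how those two arguments are assembled; the image file name and the inline spec are placeholders, and in the app the spec comes from st.session_state["yaml_format"].

```python
import base64
import os

import yaml  # PyYAML

octoai_token = os.environ.get("OCTOAI_API_KEY", "")

# Hypothetical local image; in the app the bytes come from an upload or S3.
with open("receipt.jpg", "rb") as f:
    encoded_image = base64.b64encode(f.read()).decode("utf-8")
encoded_image = f"data:image/png;base64,{encoded_image}"

# Placeholder field spec; the app uses the YAML entered in the code editor.
yaml_format = """
merchant_name:
  desc: name of merchant
total:
  desc: bill total in dollars
"""
spec = str(yaml.load(yaml_format, Loader=yaml.SafeLoader))

description = process_image(encoded_image, octoai_token, spec)
```

Note that the callers in this commit always tag the data URL as image/png, even for JPEG inputs.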
@@ -139,7 +132,7 @@ def submit_new_token():
st.session_state.octoai_api_key = st.session_state.token_text_input


st.set_page_config(layout="wide", page_title="NER Playground")
st.set_page_config(layout="wide", page_title="Multi-Modal Data Extractor")

if "octoai_api_key" not in st.session_state:
st.session_state.octoai_api_key = os.environ.get("OCTOAI_API_KEY", None)
@@ -160,24 +153,41 @@ def submit_new_token():
"""
)
else:
with st.form("input-form", clear_on_submit=True, border=True):
tab1, tab2 = st.tabs(["Files", "URLs"])
with st.form("input-form", clear_on_submit=False, border=True):
tab1, tab2, tab3 = st.tabs(["Local Files", "URLs", "S3"])

# Local files
with tab1:
upload_files = st.file_uploader(
"Upload your files here",
"Upload your PDFs/audio/JPEG files here",
type=[".pdf", ".mp3", ".mp4", ".wav", ".jpg", ".jpeg"],
accept_multiple_files=True,
key="upload_files",
)
st.caption("Click on submit after uploading to process the files.")

# URLs
with tab2:
website_url = st.text_input(
"Enter the URL of the website to scrape", key="website_url"
"Enter the URL(s) of the website to scrape", key="website_url"
)
st.caption("Use comma for multiple URLs.")

# S3
with tab3:
aws_access_key_id = st.text_input(
"AWS Access Key ID", value="AWSACCESSKEYID"
)
aws_secret_access_key = st.text_input(
"AWS Secret Key", type="password", value="asdf"
)
aws_s3_bucket = st.text_input(
"AWS S3 bucket", value="bucket-name"
)
aws_s3_bucket_path = st.text_input(
"Path to directory to process", value="path/to/dir/"
)

st.form_submit_button("Submit", on_click=submit_onclick)

st.write(
Expand All @@ -187,8 +197,8 @@ def submit_new_token():
"[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/octoml/octoai-solutions)"
)

st.write("## NER Playground")
st.caption("Named Entity Recognition Playground.")
st.write("## Multi-Modal Data Extractor")
st.caption("Powered by OctoAI.")

#################################################
# Section 1: Inputs
@@ -207,12 +217,14 @@ def submit_new_token():
executive_summary:
desc: executive summary of the document
"""
st.session_state["yaml_format"] = yaml_format


def update_json_schema(code):
# Prepare the JSON schema
json_schema = convert_to_json_schema(code)
st.session_state["json_schema"] = json_schema
st.session_state["yaml_format"] = code


if "json_schema" not in st.session_state:
@@ -239,7 +251,6 @@ def update_json_schema(code):
]
code_response = code_editor(code=yaml_format, lang="yaml", buttons=custom_btns)
if code_response["text"]:
print(code_response["text"])
update_json_schema(code_response["text"])

if not st.session_state.get("process_new_inputs", False) and (
@@ -291,15 +302,26 @@ def update_json_schema(code):
or upload_file.name.endswith(".mp4")
or upload_file.name.endswith(".wav")
):
doc_str = transcribe_audio(
tf.name, st.session_state.octoai_api_key
)
elif upload_file.name.endswith("jpg") or upload_file.name.endswith(
"jpeg"
# Convert the audio to base64 string
with open(tf.name, "rb") as f:
encoded_audio = str(base64.b64encode(f.read()), "utf-8")
doc_str = transcribe_audio(
encoded_audio, st.session_state.octoai_api_key
)
# Image file handling
elif (
upload_file.name.endswith("jpg")
or upload_file.name.endswith("jpeg")
):
doc_str = process_image(
tf.name, st.session_state.octoai_api_key
)
# Convert the images to base64 string
with open(tf.name, "rb") as f:
encoded_image = base64.b64encode(f.read()).decode("utf-8")
encoded_image = f"data:image/png;base64,{encoded_image}"
doc_str = process_image(
encoded_image,
st.session_state.octoai_api_key,
str(yaml.load(st.session_state["yaml_format"], Loader=yaml.SafeLoader))
)
st.session_state.doc_str.append(doc_str)

elif website_url:
@@ -345,6 +367,66 @@ def update_json_schema(code):
f"An error occurred while processing {got_error}. Please refresh and try again."
)

elif aws_access_key_id and aws_secret_access_key:

# Create an S3 client
s3_client = boto3.client(
's3',
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key
)
# Get the list in the bucket directory
result = s3_client.list_objects(
Bucket=aws_s3_bucket,
Prefix=aws_s3_bucket_path,
Delimiter='/'
)

if len(result.get('Contents')) == 1:
spinner_message = f"Processing {result.get('Contents')[0].get('Key')} into Markdown..."
else:
spinner_message = f"Processing {len(result.get('Contents'))-1} files into Markdown..."
# Preprocess documents
with st.status(spinner_message):
for bucket_file in result.get('Contents'):
f_name = bucket_file.get('Key')
data = s3_client.get_object(Bucket=aws_s3_bucket, Key=f_name)
if f_name == aws_s3_bucket_path:
continue
# PDF handling
if f_name.endswith(".pdf"):
# Read in first document
documents = parser.load_data(
data['Body'].read(),
extra_info={"file_name": f_name}
)
doc_str = ""
for document in documents:
doc_str += document.text
doc_str += "\n"
st.session_state.doc_str.append(doc_str)
# Audio file handling
elif (
f_name.endswith(".mp3")
or f_name.endswith(".mp4")
or f_name.endswith(".wav")
):
encoded_audio = str(base64.b64encode(data['Body'].read()), "utf-8")
doc_str = transcribe_audio(
encoded_audio, st.session_state.octoai_api_key
)
st.session_state.doc_str.append(doc_str)
elif f_name.endswith("jpg") or f_name.endswith("jpeg"):
# Convert the images to base64 string
encoded_image = base64.b64encode(data['Body'].read()).decode("utf-8")
encoded_image = f"data:image/png;base64,{encoded_image}"
doc_str = process_image(
encoded_image,
st.session_state.octoai_api_key,
str(yaml.load(st.session_state["yaml_format"], Loader=yaml.SafeLoader))
)
st.session_state.doc_str.append(doc_str)
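One caveat on the listing logic above: list_objects returns at most 1,000 keys per call. A hedged sketch of the same walk with a paginator, reusing the values collected in the S3 tab; skipping keys that end in "/" (the directory placeholder) is an assumption, not something this commit does.

```python
import boto3

# Same form values as the S3 tab above (the shown defaults are placeholders).
s3_client = boto3.client(
    "s3",
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

# list_objects caps results at 1,000 keys; a paginator walks larger prefixes.
paginator = s3_client.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket=aws_s3_bucket, Prefix=aws_s3_bucket_path):
    for obj in page.get("Contents", []):
        key = obj["Key"]
        if key.endswith("/"):
            # Skip the zero-byte "directory" placeholder object, if present.
            continue
        body = s3_client.get_object(Bucket=aws_s3_bucket, Key=key)["Body"].read()
        # ...then dispatch on the file extension exactly as in the loop above.
```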


#################################################
# Section 3: Processing the outputs
@@ -378,7 +460,7 @@ def update_json_schema(code):
"""

data = {
"model": "meta-llama-3.1-70b-instruct",
"model": "meta-llama-3.1-405b-instruct",
"messages": [
{
"role": "system",
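The payload above (truncated in this view) is ultimately posted to OctoAI's OpenAI-compatible chat completions API. A minimal sketch of the surrounding request; the endpoint URL and the response parsing are assumptions based on that compatibility and are not shown in this diff.

```python
import requests

# Assumed endpoint: OctoAI's OpenAI-compatible chat completions API.
response = requests.post(
    "https://text.octoai.run/v1/chat/completions",
    headers={
        "Content-Type": "application/json",
        "Authorization": f"Bearer {st.session_state.octoai_api_key}",
    },
    json=data,  # the {"model": ..., "messages": [...]} payload built above
    timeout=300,
)
extracted = response.json()["choices"][0]["message"]["content"]
```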
1 change: 1 addition & 0 deletions ner/requirements.txt
@@ -5,3 +5,4 @@ llama-parse
streamlit-code-editor
firecrawl-py
snowflake-connector-python[pandas]
boto3
9 changes: 9 additions & 0 deletions ner/yaml_examples/electronic_health_records.yaml
@@ -0,0 +1,9 @@
# Describe the fields of information in YAML format
executive_summary:
  desc: one sentence executive summary of the transcript
symptoms_list:
  desc: comma separated list of symptoms mentioned in the transcript
medication_list:
  desc: comma separated list of medication mentioned in the transcript
procedures_list:
  desc: comma separated list of procedures mentioned in the transcript
13 changes: 13 additions & 0 deletions ner/yaml_examples/expense_management.yaml
@@ -0,0 +1,13 @@
# Describe the fields of information in YAML format
merchant_name:
  desc: name of merchant
date:
  desc: date on the receipt
type:
  desc: type of expense (e.g. food, transportation, lodging...)
list_of_items:
  desc: comma separated list of items on the bill
total:
  desc: bill total in dollars
includes_alcohol:
  desc: yes if the bill includes alcohol, else no
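Each example spec is plain YAML in which every top-level key names a field to extract and its desc describes it. A quick sketch of what loading this file yields, using the path added in this commit.

```python
import yaml  # PyYAML

# Path as added in this commit.
with open("ner/yaml_examples/expense_management.yaml") as f:
    spec = yaml.load(f, Loader=yaml.SafeLoader)

# spec is a nested dict: {"merchant_name": {"desc": "name of merchant"}, ...}
print(spec["total"]["desc"])  # -> bill total in dollars
```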
11 changes: 11 additions & 0 deletions ner/yaml_examples/financial_records.yaml
@@ -0,0 +1,11 @@
# Describe the fields of information in YAML format
product_revenue:
  desc: product revenue for the quarter
product_revenue_year_over_year_growth:
  desc: product revenue year over year growth for the quarter
customers_over_one_mil:
  desc: number of customers with trailing 12-month product revenue greater than 1 million
investor_contact:
  desc: name of investor contact
investor_contact_email:
  desc: email address of investor contact
7 changes: 7 additions & 0 deletions ner/yaml_examples/vegetation_management.yaml
@@ -0,0 +1,7 @@
# Describe the fields of information in YAML format
power_delivery_line:
  desc: yes if power delivery line is present in image, else no
foliage:
  desc: yes if foliage is present in image, else no
foliage_close_to_power_delivery_line:
  desc: yes if foliage is in contact with, or too close to power delivery line in image, else no
