Feat/converse support images #211

Merged: 11 commits, Mar 3, 2025
95 changes: 70 additions & 25 deletions examples/core.py
@@ -15,9 +15,9 @@ def run_provider(provider, model, api_key=None, **kwargs):
    latencies = {}

    print("\nAsync Non-Stream")
    chat_request = build_chat_request(model, chat_input="Hello, my name is Jason Json", is_stream=False)
    chat_request = build_chat_request(model, chat_input="Hello, my name is Jason", is_stream=False)
    string = """
What is Lorem Ipsum? json
What is Lorem Ipsum?
Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.

Why do we use it?
@@ -27,7 +27,7 @@ def run_provider(provider, model, api_key=None, **kwargs):
Where does it come from?
Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of "de Finibus Bonorum et Malorum" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", comes from a line in section 1.10.32.

What is Lorem Ipsum? json
What is Lorem Ipsum?
Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.

Why do we use it?
@@ -37,7 +37,7 @@ def run_provider(provider, model, api_key=None, **kwargs):
Where does it come from?
Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of "de Finibus Bonorum et Malorum" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", comes from a line in section 1.10.32.

What is Lorem Ipsum? json
What is Lorem Ipsum?
Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.

Why do we use it?
@@ -58,7 +58,7 @@ def run_provider(provider, model, api_key=None, **kwargs):

print("\nAsync Stream")
async def async_stream():
chat_request = build_chat_request(model, chat_input="Hello, my name is Tom Json", is_stream=True)
chat_request = build_chat_request(model, chat_input="Hello, my name is Tom", is_stream=True)

response_async = await llm.achat(**chat_request)
async for p in response_async:
@@ -74,15 +74,15 @@ async def async_stream():


print("\nSync Non-Stream")
chat_request = build_chat_request(model, chat_input="Hello, my name is Alice Json", is_stream=False)
chat_request = build_chat_request(model, chat_input="Hello, my name is Alice", is_stream=False)

response_sync = llm.chat(**chat_request)
pprint(response_sync)
latencies["sync (ms)"]= response_sync.metrics["latency_s"]*1000


print("\nSync Stream")
chat_request = build_chat_request(model, chat_input="Hello, my name is Mary Json", is_stream=True)
chat_request = build_chat_request(model, chat_input="Hello, my name is Mary", is_stream=True)

response_sync_stream = llm.chat(**chat_request)
for p in response_sync_stream:
@@ -126,7 +126,6 @@ def build_chat_request(model: str, chat_input: str, is_stream: bool, max_tokens:
"parameters": {
"temperature": 0,
"max_tokens": max_tokens,
"response_format": {"type": "json_object"},
"functions": None,
}
}
@@ -138,29 +137,75 @@ def multiple_provider_runs(provider:str, model:str, num_runs:int, api_key:str, *
        latencies = run_provider(provider=provider, model=model, api_key=api_key, **kwargs)
        pprint(latencies)


def run_chat_all_providers():
    # OpenAI
    multiple_provider_runs(provider="openai", model="gpt-4o-mini", api_key=os.environ["OPENAI_API_KEY"], num_runs=1)
    multiple_provider_runs(provider="openai", model="o3-mini", api_key=os.environ["OPENAI_API_KEY"], num_runs=1)
    #multiple_provider_runs(provider="openai", model="o1-preview", api_key=os.environ["OPENAI_API_KEY"], num_runs=1)

    # Azure
    multiple_provider_runs(provider="azure", model="gpt-4o-mini", num_runs=1, api_key=os.environ["AZURE_API_KEY"], api_version=os.environ["AZURE_API_VERSION"], api_endpoint=os.environ["AZURE_API_ENDPOINT"])
    #multiple_provider_runs(provider="azure", model="gpt-4o", num_runs=1, api_key=os.environ["AZURE_API_KEY"], api_version=os.environ["AZURE_API_VERSION"], api_endpoint=os.environ["AZURE_API_ENDPOINT"])
    #multiple_provider_runs(provider="azure", model="o1-mini", num_runs=1, api_key=os.environ["AZURE_API_KEY"], api_version=os.environ["AZURE_API_VERSION"], api_endpoint=os.environ["AZURE_API_ENDPOINT"])
    #multiple_provider_runs(provider="azure", model="o1-preview", num_runs=1, api_key=os.environ["AZURE_API_KEY"], api_version=os.environ["AZURE_API_VERSION"], api_endpoint=os.environ["AZURE_API_ENDPOINT"])

    #multiple_provider_runs(provider="anthropic", model="claude-3-opus-20240229", num_runs=1, api_key=os.environ["ANTHROPIC_API_KEY"])

    multiple_provider_runs(provider="vertexai", model="gemini-1.5-flash", num_runs=1, api_key=os.environ["GOOGLE_API_KEY"])

    # Bedrock
    multiple_provider_runs(provider="bedrock", model="us.amazon.nova-lite-v1:0", num_runs=1, api_key=None, region=os.environ["BEDROCK_REGION"], secret_key=os.environ["BEDROCK_SECRET_KEY"], access_key=os.environ["BEDROCK_ACCESS_KEY"])
    #multiple_provider_runs(provider="bedrock", model="anthropic.claude-3-5-sonnet-20241022-v2:0", num_runs=1, api_key=None, region=os.environ["BEDROCK_REGION"], secret_key=os.environ["BEDROCK_SECRET_KEY"], access_key=os.environ["BEDROCK_ACCESS_KEY"])

run_chat_all_providers()

import base64

def messages(img_path):
    """
    Builds a user message that combines text with two images:
    one read from disk and embedded as a Base64 data URL, and
    one referenced by a public https URL.
    """
    with open(img_path, "rb") as f:
        image_bytes = f.read()

    base64_image = base64.b64encode(image_bytes).decode("utf-8")
    return [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What's in this image?"},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                },
                {
                    "type": "image_url",
                    "image_url": {"url": "https://awsmp-logos.s3.amazonaws.com/seller-zx4pk43qpmxoa/53d235806f343cec94aac3c577d81c13.png"},
                },
            ],
        }
    ]

def run_send_imgs():
    provider = "bedrock"
    model = "us.amazon.nova-lite-v1:0"
    chat_input = messages(img_path="./libs/llmstudio/tests/integration_tests/test_data/llmstudio-logo.jpeg")
    chat_request = build_chat_request(model=model, chat_input=chat_input, is_stream=False)
    llm = LLMCore(provider=provider, api_key=None, region=os.environ["BEDROCK_REGION"], secret_key=os.environ["BEDROCK_SECRET_KEY"], access_key=os.environ["BEDROCK_ACCESS_KEY"])
    response_sync = llm.chat(**chat_request)
    #print(response_sync)
    response_sync.clean_print()

    #for p in response_sync:
    #    if p.metrics:
    #        p.clean_print()

run_send_imgs()
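For reference, the example above sends images in the OpenAI-style `image_url` content format, and the provider change in this PR converts each part into a Bedrock Converse content block. A minimal sketch of that mapping, using the helper names from the diff below (the literal values are illustrative only):

```python
# Hypothetical illustration of the conversion performed in
# BedrockConverseProvider._process_messages (see diff below).
openai_part = {
    "type": "image_url",
    "image_url": {"url": "data:image/png;base64,iVBORw0KGgo..."},  # example value
}

# _get_image_bytes() decodes the data URL (or downloads an http(s) URL),
# and _get_img_format_from_bytes() sniffs the format via magic numbers.
converse_block = {
    "image": {
        "format": "png",              # e.g. "jpeg", "png", "gif", "webp", "tiff"
        "source": {"bytes": b"..."},  # raw image bytes, not Base64
    }
}
```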
87 changes: 87 additions & 0 deletions libs/core/llmstudio_core/providers/bedrock_converse.py
@@ -1,5 +1,7 @@
import base64
import json
import os
import re
import time
import uuid
from typing import (
@@ -14,6 +16,7 @@
)

import boto3
import requests
from llmstudio_core.exceptions import ProviderError
from llmstudio_core.providers.provider import ChatRequest, ProviderCore, provider
from llmstudio_core.utils import OpenAIToolFunction
@@ -276,6 +279,34 @@ def _process_messages(
                        }
                    )
                    messages.append(tool_use)
            elif isinstance(message.get("content"), list):
                converse_content_list = []
                for content in message.get("content"):
                    converse_content = {}
                    if content.get("type") == "text":
                        converse_content["text"] = content.get("text")
                    elif content.get("type") == "image_url":
                        image_url = content.get("image_url")["url"]
                        bytes_image = BedrockConverseProvider._get_image_bytes(
                            image_url
                        )
                        format = (
                            BedrockConverseProvider._get_img_format_from_bytes(
                                bytes_image
                            )
                        )
                        converse_content["image"] = {
                            "format": format,
                            "source": {"bytes": bytes_image},
                        }
                    converse_content_list.append(converse_content)

                messages.append(
                    {
                        "role": message.get("role"),
                        "content": converse_content_list,
                    }
                )
            else:
                messages.append(
                    {
@@ -303,6 +334,62 @@ def _process_messages(

        return messages, system_prompt

    @staticmethod
    def _base64_to_bytes(image_url: str) -> bytes:
        """
        Extracts and decodes Base64 image data from a 'data:image/...;base64,...' URL.
        Returns the raw image bytes.
        """
        if not image_url.startswith("data:image/"):
            raise ValueError("Invalid Base64 image URL")

        base64_data = re.sub(r"^data:image/[^;]+;base64,", "", image_url)

        return base64.b64decode(base64_data)

    @staticmethod
    def _get_img_format_from_bytes(image_bytes: bytes) -> str:
        """
        Determines the image format from raw image bytes using file signatures (magic numbers).
        """
        if image_bytes.startswith(b"\xFF\xD8\xFF"):
            return "jpeg"
        elif image_bytes.startswith(b"\x89PNG\r\n\x1A\n"):
            return "png"
        elif image_bytes.startswith(b"GIF87a") or image_bytes.startswith(b"GIF89a"):
            return "gif"
        elif (
            image_bytes.startswith(b"\x52\x49\x46\x46") and image_bytes[8:12] == b"WEBP"
        ):
            return "webp"
        elif image_bytes.startswith(b"\x49\x49\x2A\x00") or image_bytes.startswith(
            b"\x4D\x4D\x00\x2A"
        ):
            return "tiff"
        else:
            raise ValueError("Unknown image format")

    @staticmethod
    def _get_image_bytes(image_url: str) -> bytes:
        """
        Resolves an image URL to raw image bytes.
        - If it is already in 'data:image/...;base64,...' format, decodes the Base64 payload.
        - If it is a normal URL, downloads the image and returns its bytes.
        """
        if image_url.startswith("data:image/"):
            return BedrockConverseProvider._base64_to_bytes(image_url)

        elif image_url.startswith(("http://", "https://")):
            response = requests.get(image_url)
            if response.status_code != 200:
                raise ValueError(f"Failed to download image: {response.status_code}")

            image_bytes = response.content
            return image_bytes

        else:
            raise ValueError("Invalid image URL format")

    @staticmethod
    def _process_tools(parameters: dict) -> Optional[Dict]:
        if parameters.get("tools") is None and parameters.get("functions") is None:
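As a quick sanity check of the magic-number detection added above, a minimal sketch (assuming `llmstudio_core` is installed and the module imports cleanly on its own):

```python
# Hypothetical spot-check of _get_img_format_from_bytes using the
# file signatures handled in the diff above.
from llmstudio_core.providers.bedrock_converse import BedrockConverseProvider

png_bytes = b"\x89PNG\r\n\x1A\n" + b"\x00" * 16   # PNG signature + padding
jpeg_bytes = b"\xFF\xD8\xFF\xE0" + b"\x00" * 16   # JPEG SOI marker + padding
webp_bytes = b"RIFF" + b"\x00" * 4 + b"WEBP"      # RIFF header with WEBP tag at offset 8

assert BedrockConverseProvider._get_img_format_from_bytes(png_bytes) == "png"
assert BedrockConverseProvider._get_img_format_from_bytes(jpeg_bytes) == "jpeg"
assert BedrockConverseProvider._get_img_format_from_bytes(webp_bytes) == "webp"
```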