Skip to content

Commit

Permalink
ref: fix multimodal prompt type
Browse files Browse the repository at this point in the history
  • Loading branch information
kartik4949 committed Jan 31, 2025
1 parent f19e003 commit a7e1237
Show file tree
Hide file tree
Showing 8 changed files with 153 additions and 68 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,8 @@ def main():
#############################################

recipe_generator = RecipeGenerator(
model_name="deepseek-ai/DeepSeek-R1",
backend="openai",
backend_params={"base_url": "https://api.kluster.ai/v1"}
model_name="deepinfra/meta-llama/Llama-2-70b-chat-hf",
backend="litellm",
)

# Generate recipes for all cuisines
Expand Down
60 changes: 60 additions & 0 deletions examples/multimodal/recipe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""Generate synthetic recipes from an ingredients image and cuisine using curator."""

from datasets import Dataset

from bespokelabs import curator


class RecipeGenerator(curator.LLM):
    """A recipe generator that generates recipes for different cuisines.

    Builds a multimodal prompt — a text instruction plus an image of the
    available ingredients — and returns the raw model response as the recipe.
    """

    def prompt(self, input: dict) -> tuple:
        """Build a multimodal prompt from the cuisine and ingredients image.

        Args:
            input: Row with a ``cuisine`` name and an ``image_url`` pointing
                at a picture of the ingredients.

        Returns:
            A ``(text, Image)`` tuple, which curator treats as a multimodal
            prompt (text part plus image part).
        """
        # NOTE: the return annotation was previously `-> str`, but this method
        # returns a tuple; downstream (prompt_formatter) branches on tuple.
        text = f"Create me a recipe for {input['cuisine']} cuisine and ingredients from the image."
        return text, curator.types.Image(url=input["image_url"])

    def parse(self, input: dict, response: str) -> dict:
        """Parse the model response along with the input into the desired output format."""
        return {
            "recipe": response,
        }


def main():
    """Generate synthetic recipes for different cuisines.

    Builds a tiny dataset of (cuisine, ingredients-image URL) pairs, runs the
    multimodal RecipeGenerator over it, and prints the results as a DataFrame.
    """
    # List of (cuisine, ingredients image URL) pairs to generate recipes for.
    cuisines = [
        {"cuisine": cuisine[0], "image_url": cuisine[1]}
        for cuisine in [
            ("Indian", "https://cdn.tasteatlas.com//images/ingredients/fcee541cd2354ed8b68b50d1aa1acad8.jpeg"),
            ("Thai", "https://cdn.tasteatlas.com//images/dishes/da5fd425608f48b09555f5257a8d3a86.jpg"),
        ]
    ]
    cuisines = Dataset.from_list(cuisines)

    # Create the generator using the OpenAI backend (requires OPENAI_API_KEY).
    #############################################
    # To switch to Gemini models instead (change model_name/backend below):
    # 1. Go to https://aistudio.google.com/app/apikey
    # 2. Generate an API key
    # 3. Set environment variable: GEMINI_API_KEY
    # 4. If you are a free user, update rate limits:
    #       max_requests_per_minute=15
    #       max_tokens_per_minute=1_000_000
    #       (Up to 1,000 requests per day)
    #############################################

    recipe_generator = RecipeGenerator(
        model_name="gpt-4o",
        backend="openai",
    )

    # Generate recipes for all cuisines (one request per dataset row).
    recipes = recipe_generator(cuisines)

    # Print results
    print(recipes.to_pandas())


# Run the example only when executed directly as a script.
if __name__ == "__main__":
    main()
87 changes: 51 additions & 36 deletions examples/poem-generation/poem.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,50 +11,65 @@
from bespokelabs import curator


import base64
from pydantic import BaseModel, Field
from bespokelabs import curator
# We use Pydantic and structured outputs to define the format of the response.
# This defines a list of topics, which is the response format for the topic generator.
class Topics(BaseModel):
    """A list of topics.

    Structured-output response format for the topic generator: the model
    fills in ``topics_list`` with candidate poem topics.
    """

    # Populated by the model via structured output.
    topics_list: List[str] = Field(description="A list of topics.")


# We define a topic generator class that inherits from LLM
class TopicGenerator(curator.LLM):
    """Generates a diverse set of poem topics using structured output."""

    response_format = Topics

    def prompt(self, input: dict) -> str:
        """Return the fixed instruction asking for ten diverse poem topics."""
        return "Generate 10 diverse topics that are suitable for writing poems about."

    def parse(self, input: dict, response: Topics) -> dict:
        """Expand the structured Topics response into one row per topic."""
        rows = []
        for topic in response.topics_list:
            rows.append({"topic": topic})
        return rows


# We instantiate the topic generator and call it to generate topics
topic_generator = TopicGenerator(model_name="gpt-4o-mini")
topics: Dataset = topic_generator()
print(topics["topic"])

class MultiModalRecipe(BaseModel):
recipe: str
title: str
instructions: str
cook_time: str
ingredients: str

# Define a list of poems.
class Poems(BaseModel):
"""A list of poems."""

class MultiModalRecipeGenerator(curator.LLM):
"""A recipe generator that can handle multimodal inputs and outputs."""
poems_list: List[str] = Field(description="A list of poems.")

response_format = MultiModalRecipe

def prompt(self, input: dict) -> curator.MultiModalPrompt:
prompt = f"Generate a {input['cuisine']} recipe given ingredients in the image. Be creative but keep it realistic."
return prompt, curator.types.Image(url=input["ingredients_url"])
# We define a poet class that inherits from LLM
class Poet(curator.LLM):
"""A poet that generates poems about given topics."""

response_format = Poems

def parse(self, input: dict, response: MultiModalRecipe) -> dict:
result = {
"title": response.title,
"ingredients": response.ingredients,
"instructions": response.instructions,
"cook_time": response.cook_time,
"cuisine": input["cuisine"],
}
return result
def prompt(self, input: dict) -> str:
"""Generate a prompt using the topic."""
return f"Write two poems about {input['topic']}."

def parse(self, input: dict, response: Poems) -> dict:
"""Parse the model response along with the input to the model into the desired output format.."""
return [{"topic": input["topic"], "poem": p} for p in response.poems_list]

def main():
"""Example usage of multimodal recipe generation."""
recipe_generator = MultiModalRecipeGenerator(
model_name="gpt-4o",
backend="openai",
backend_params={"max_requests_per_minute": 2_000, "max_tokens_per_minute": 4_000_000},
)

recipe = recipe_generator({
"cuisine": "Italian",
"food_image": "path/to/pizza_reference.jpg"
})
# We instantiate the poet and apply it to the topics dataset
poet = Poet(model_name="gpt-4o-mini")
poems = poet(topics)
print(poems.to_pandas())

print(recipe.to_pandas())
# Expected output:
# topic poem
# 0 Dreams vs. reality In the realm where dreams take flight,\nWhere ...
# 1 Dreams vs. reality Reality stands with open eyes,\nA weighty thro...
# 2 Urban loneliness in a bustling city In the city's heart where shadows blend,\nAmon...
# 3 Urban loneliness in a bustling city Among the crowds, I walk alone,\nA sea of face...
2 changes: 1 addition & 1 deletion src/bespokelabs/curator/llm/prompt_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def create_generic_request(self, row: _DictOrBaseModel, idx: int) -> GenericRequ
_validate_messages(prompts)
messages = prompts
elif isinstance(prompts, tuple):
messages = _MultiModalPrompt.load(prompts)
messages = [{"role": "user", "content": _MultiModalPrompt.load(prompts)}]
else:
raise ValueError("The return value of the prompt_func must be a list of dictionaries.")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,7 @@ async def process_requests_from_file(
async for line in file:
if self._semaphore:
await self._semaphore.acquire()

generic_request = GenericRequest.model_validate_json(line)

if generic_request.original_row_idx in completed_request_ids:
Expand Down
32 changes: 20 additions & 12 deletions src/bespokelabs/curator/request_processor/openai_request_mixin.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import logging
import base64
import logging
from typing import Any

import pydantic

from bespokelabs.curator.types.generic_request import GenericRequest
from bespokelabs.curator.types.prompt import _MultiModalPrompt

Expand Down Expand Up @@ -46,21 +48,27 @@ def create_api_specific_request_online(self, generic_request: GenericRequest) ->
return request

def _unpack(self, messages):
if isinstance(messages['content'], _MultiModalPrompt):
return self._handle_multi_modal_prompt(messages)
return messages

def _handle_multi_modal_prompt(self, messages):
unpacked_messages = []
for message in messages:
try:
content = _MultiModalPrompt.model_validate(message["content"])
content = self._handle_multi_modal_prompt(content)
message["content"] = content
unpacked_messages.append(message)

except pydantic.ValidationError:
unpacked_messages.append(message)
return unpacked_messages

def _handle_multi_modal_prompt(self, message):
content = []
texts = messages['content'].texts
texts = message.texts
for text in texts:
content.append({"type": "text", "text": text})
for image in messages['content'].images:
for image in message.images:
if image.url:
content.append({"type": "image_url", "image_url": {"url": image.url}})
elif image.content:
image_base64 = base64.b64encode(image.content).decode("utf-8")
image_base64 = base64.b64encode(image.content).decode("utf-8")
content.append({"type": "image_url", "image_url": {"url": image_base64}})

messages['content'] = content
return messages
return content
3 changes: 1 addition & 2 deletions src/bespokelabs/curator/types/generic_request.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import Any, Dict, List

from pydantic import BaseModel, Field
from .prompt import _MultiModalPrompt

"""A generic request model for LLM API requests.
Expand All @@ -19,7 +18,7 @@ class GenericRequest(BaseModel):
"""A generic request model for LLM API requests."""

model: str
messages: List[Dict[str, Any]] | _MultiModalPrompt
messages: List[Dict[str, Any]]
response_format: Dict[str, Any] | None = None
original_row: Dict[str, Any]
original_row_idx: int
Expand Down
31 changes: 17 additions & 14 deletions src/bespokelabs/curator/types/prompt.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,51 @@
# Description: Pydantic models for multimodal prompts.
import typing as t

from pydantic import BaseModel, Field


class BaseType(BaseModel):
"""A class to represent the base type for multimodal prompts."""

type: str = Field(..., description="The type of the multimodal prompt.")
type: t.ClassVar[str] = Field(..., description="The type of the multimodal prompt.")


class Image(BaseModel):
class Image(BaseType):
"""A class to represent an image for multimodal prompts."""

url: str = Field(None, description="The URL of the image.")
content: str = Field(None, description="Base64-encoded image content.")
type = "image"
url: str = Field("", description="The URL of the image.")
content: str = Field("", description="Base64-encoded image content.")
type: t.ClassVar[str] = "image"

def __post_init__(self):
"""Post init."""
# assert url or content is provided
assert self.url or self.content, "Either 'url' or 'content' must be provided."


class File(BaseModel):
class File(BaseType):
"""A class to represent a file for multimodal prompts."""

url: str = Field(..., description="The URL of the file.")
type = "file"
type: t.ClassVar[str] = "file"


class _MultiModalPrompt(BaseType):
"""A class to represent a multimodal prompt."""

texts: str = Field(None, description="The text of the prompt.")
images: Image = Field(None, description="The image of the prompt.")
files: File = Field(None, description="The file of the prompt.")
texts: t.List[str] = Field(default_factory=list, description="The text of the prompt.")
images: t.List[Image] = Field(default_factory=list, description="The image of the prompt.")
files: t.List[File] = Field(default_factory=list, description="The file of the prompt.")

@classmethod
def load(cls, messages):
prompt = {}
prompt = {"texts": [], "images": [], "files": []}
for msg in messages:
if isinstance(msg, BaseType):
if msg.type == "image":
prompt["images"] = msg
prompt["images"].append(msg)
elif msg.type == "file":
prompt["files"] = msg
prompt["files"].append(msg)
else:
prompt["text"] = msg
prompt["texts"].append(msg)
return cls(**prompt)

0 comments on commit a7e1237

Please sign in to comment.