Skip to content

Commit

Permalink
ref: fix multimodal prompt type
Browse files Browse the repository at this point in the history
  • Loading branch information
kartik4949 committed Jan 31, 2025
1 parent f19e003 commit a7e1237
Show file tree
Hide file tree
Showing 8 changed files with 153 additions and 68 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,8 @@ def main():
#############################################

recipe_generator = RecipeGenerator(
model_name="deepseek-ai/DeepSeek-R1",
backend="openai",
backend_params={"base_url": "https://api.kluster.ai/v1"}
model_name="deepinfra/meta-llama/Llama-2-70b-chat-hf",
backend="litellm",
)

# Generate recipes for all cuisines
Expand Down
60 changes: 60 additions & 0 deletions examples/multimodal/recipe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""Generate synthetic recipes from an ingredients image and cuisine using curator."""

from datasets import Dataset

from bespokelabs import curator


class RecipeGenerator(curator.LLM):
    """A recipe generator that generates recipes for different cuisines.

    Builds a multimodal prompt — a text instruction plus an image of the
    available ingredients — and returns the raw model response as the recipe.
    """

    def prompt(self, input: dict) -> tuple:
        """Build a multimodal prompt from the cuisine and ingredients image.

        Args:
            input: Row with a ``cuisine`` name and an ``image_url`` pointing
                at a picture of the ingredients.

        Returns:
            A ``(text, Image)`` tuple, which curator treats as a multimodal
            prompt (text part plus image part).
        """
        # NOTE: the return annotation was previously `-> str`, but this method
        # returns a tuple; downstream (prompt_formatter) branches on tuple.
        text = f"Create me a recipe for {input['cuisine']} cuisine and ingredients from the image."
        return text, curator.types.Image(url=input["image_url"])

    def parse(self, input: dict, response: str) -> dict:
        """Parse the model response along with the input into the desired output format."""
        return {
            "recipe": response,
        }


def main():
    """Generate synthetic recipes for different cuisines.

    Builds a tiny dataset of (cuisine, ingredients-image URL) pairs, runs the
    multimodal RecipeGenerator over it, and prints the results as a DataFrame.
    """
    # List of (cuisine, ingredients image URL) pairs to generate recipes for.
    cuisines = [
        {"cuisine": cuisine[0], "image_url": cuisine[1]}
        for cuisine in [
            ("Indian", "https://cdn.tasteatlas.com//images/ingredients/fcee541cd2354ed8b68b50d1aa1acad8.jpeg"),
            ("Thai", "https://cdn.tasteatlas.com//images/dishes/da5fd425608f48b09555f5257a8d3a86.jpg"),
        ]
    ]
    cuisines = Dataset.from_list(cuisines)

    # Create the generator using the OpenAI backend (requires OPENAI_API_KEY).
    #############################################
    # To switch to Gemini models instead (change model_name/backend below):
    # 1. Go to https://aistudio.google.com/app/apikey
    # 2. Generate an API key
    # 3. Set environment variable: GEMINI_API_KEY
    # 4. If you are a free user, update rate limits:
    #       max_requests_per_minute=15
    #       max_tokens_per_minute=1_000_000
    #       (Up to 1,000 requests per day)
    #############################################

    recipe_generator = RecipeGenerator(
        model_name="gpt-4o",
        backend="openai",
    )

    # Generate recipes for all cuisines (one request per dataset row).
    recipes = recipe_generator(cuisines)

    # Print results
    print(recipes.to_pandas())


# Run the example only when executed directly as a script.
if __name__ == "__main__":
    main()
87 changes: 51 additions & 36 deletions examples/poem-generation/poem.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,50 +11,65 @@
from bespokelabs import curator


import base64
from pydantic import BaseModel, Field
from bespokelabs import curator
# We use Pydantic and structured outputs to define the format of the response.
# This defines a list of topics, which is the response format for the topic generator.
class Topics(BaseModel):
    """A list of topics.

    Structured-output response format for the topic generator: the model
    fills in ``topics_list`` with candidate poem topics.
    """

    # Populated by the model via structured output.
    topics_list: List[str] = Field(description="A list of topics.")


# We define a topic generator class that inherits from LLM
class TopicGenerator(curator.LLM):
    """Generates a diverse set of poem topics using structured output."""

    response_format = Topics

    def prompt(self, input: dict) -> str:
        """Return the fixed instruction asking for ten diverse poem topics."""
        return "Generate 10 diverse topics that are suitable for writing poems about."

    def parse(self, input: dict, response: Topics) -> dict:
        """Expand the structured Topics response into one row per topic."""
        rows = []
        for topic in response.topics_list:
            rows.append({"topic": topic})
        return rows


# We instantiate the topic generator and call it to generate topics
topic_generator = TopicGenerator(model_name="gpt-4o-mini")
topics: Dataset = topic_generator()
print(topics["topic"])

class MultiModalRecipe(BaseModel):
recipe: str
title: str
instructions: str
cook_time: str
ingredients: str

# Define a list of poems.
class Poems(BaseModel):
"""A list of poems."""

class MultiModalRecipeGenerator(curator.LLM):
"""A recipe generator that can handle multimodal inputs and outputs."""
poems_list: List[str] = Field(description="A list of poems.")

response_format = MultiModalRecipe

def prompt(self, input: dict) -> curator.MultiModalPrompt:
prompt = f"Generate a {input['cuisine']} recipe given ingredients in the image. Be creative but keep it realistic."
return prompt, curator.types.Image(url=input["ingredients_url"])
# We define a poet class that inherits from LLM
class Poet(curator.LLM):
"""A poet that generates poems about given topics."""

response_format = Poems

def parse(self, input: dict, response: MultiModalRecipe) -> dict:
result = {
"title": response.title,
"ingredients": response.ingredients,
"instructions": response.instructions,
"cook_time": response.cook_time,
"cuisine": input["cuisine"],
}
return result
def prompt(self, input: dict) -> str:
"""Generate a prompt using the topic."""
return f"Write two poems about {input['topic']}."

def parse(self, input: dict, response: Poems) -> dict:
"""Parse the model response along with the input to the model into the desired output format.."""
return [{"topic": input["topic"], "poem": p} for p in response.poems_list]

def main():
"""Example usage of multimodal recipe generation."""
recipe_generator = MultiModalRecipeGenerator(
model_name="gpt-4o",
backend="openai",
backend_params={"max_requests_per_minute": 2_000, "max_tokens_per_minute": 4_000_000},
)

recipe = recipe_generator({
"cuisine": "Italian",
"food_image": "path/to/pizza_reference.jpg"
})
# We instantiate the poet and apply it to the topics dataset
poet = Poet(model_name="gpt-4o-mini")
poems = poet(topics)
print(poems.to_pandas())

print(recipe.to_pandas())
# Expected output:
# topic poem
# 0 Dreams vs. reality In the realm where dreams take flight,\nWhere ...
# 1 Dreams vs. reality Reality stands with open eyes,\nA weighty thro...
# 2 Urban loneliness in a bustling city In the city's heart where shadows blend,\nAmon...
# 3 Urban loneliness in a bustling city Among the crowds, I walk alone,\nA sea of face...
2 changes: 1 addition & 1 deletion src/bespokelabs/curator/llm/prompt_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def create_generic_request(self, row: _DictOrBaseModel, idx: int) -> GenericRequ
_validate_messages(prompts)
messages = prompts
elif isinstance(prompts, tuple):
messages = _MultiModalPrompt.load(prompts)
messages = [{"role": "user", "content": _MultiModalPrompt.load(prompts)}]
else:
raise ValueError("The return value of the prompt_func must be a list of dictionaries.")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,7 @@ async def process_requests_from_file(
async for line in file:
if self._semaphore:
await self._semaphore.acquire()

generic_request = GenericRequest.model_validate_json(line)

if generic_request.original_row_idx in completed_request_ids:
Expand Down
32 changes: 20 additions & 12 deletions src/bespokelabs/curator/request_processor/openai_request_mixin.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import logging
import base64
import logging
from typing import Any

import pydantic

from bespokelabs.curator.types.generic_request import GenericRequest
from bespokelabs.curator.types.prompt import _MultiModalPrompt

Expand Down Expand Up @@ -46,21 +48,27 @@ def create_api_specific_request_online(self, generic_request: GenericRequest) ->
return request

def _unpack(self, messages):
if isinstance(messages['content'], _MultiModalPrompt):
return self._handle_multi_modal_prompt(messages)
return messages

def _handle_multi_modal_prompt(self, messages):
unpacked_messages = []
for message in messages:
try:
content = _MultiModalPrompt.model_validate(message["content"])
content = self._handle_multi_modal_prompt(content)
message["content"] = content
unpacked_messages.append(message)

except pydantic.ValidationError:
unpacked_messages.append(message)
return unpacked_messages

def _handle_multi_modal_prompt(self, message):
content = []
texts = messages['content'].texts
texts = message.texts
for text in texts:
content.append({"type": "text", "text": text})
for image in messages['content'].images:
for image in message.images:
if image.url:
content.append({"type": "image_url", "image_url": {"url": image.url}})
elif image.content:
image_base64 = base64.b64encode(image.content).decode("utf-8")
image_base64 = base64.b64encode(image.content).decode("utf-8")
content.append({"type": "image_url", "image_url": {"url": image_base64}})

messages['content'] = content
return messages
return content
3 changes: 1 addition & 2 deletions src/bespokelabs/curator/types/generic_request.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import Any, Dict, List

from pydantic import BaseModel, Field
from .prompt import _MultiModalPrompt

"""A generic request model for LLM API requests.
Expand All @@ -19,7 +18,7 @@ class GenericRequest(BaseModel):
"""A generic request model for LLM API requests."""

model: str
messages: List[Dict[str, Any]] | _MultiModalPrompt
messages: List[Dict[str, Any]]
response_format: Dict[str, Any] | None = None
original_row: Dict[str, Any]
original_row_idx: int
Expand Down
31 changes: 17 additions & 14 deletions src/bespokelabs/curator/types/prompt.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,51 @@
# Description: Pydantic models for multimodal prompts.
import typing as t

from pydantic import BaseModel, Field


class BaseType(BaseModel):
"""A class to represent the base type for multimodal prompts."""

type: str = Field(..., description="The type of the multimodal prompt.")
type: t.ClassVar[str] = Field(..., description="The type of the multimodal prompt.")


class Image(BaseModel):
class Image(BaseType):
"""A class to represent an image for multimodal prompts."""

url: str = Field(None, description="The URL of the image.")
content: str = Field(None, description="Base64-encoded image content.")
type = "image"
url: str = Field("", description="The URL of the image.")
content: str = Field("", description="Base64-encoded image content.")
type: t.ClassVar[str] = "image"

def __post_init__(self):
"""Post init."""
# assert url or content is provided
assert self.url or self.content, "Either 'url' or 'content' must be provided."


class File(BaseModel):
class File(BaseType):
"""A class to represent a file for multimodal prompts."""

url: str = Field(..., description="The URL of the file.")
type = "file"
type: t.ClassVar[str] = "file"


class _MultiModalPrompt(BaseType):
"""A class to represent a multimodal prompt."""

texts: str = Field(None, description="The text of the prompt.")
images: Image = Field(None, description="The image of the prompt.")
files: File = Field(None, description="The file of the prompt.")
texts: t.List[str] = Field(default_factory=list, description="The text of the prompt.")
images: t.List[Image] = Field(default_factory=list, description="The image of the prompt.")
files: t.List[File] = Field(default_factory=list, description="The file of the prompt.")

@classmethod
def load(cls, messages):
prompt = {}
prompt = {"texts": [], "images": [], "files": []}
for msg in messages:
if isinstance(msg, BaseType):
if msg.type == "image":
prompt["images"] = msg
prompt["images"].append(msg)
elif msg.type == "file":
prompt["files"] = msg
prompt["files"].append(msg)
else:
prompt["text"] = msg
prompt["texts"].append(msg)
return cls(**prompt)

0 comments on commit a7e1237

Please sign in to comment.