Caption Container Optimizations & Qwen Support #118

Merged · 5 commits · Dec 5, 2024
11 changes: 11 additions & 0 deletions modules/odr_caption/Taskfile.caption.yml
@@ -16,10 +16,21 @@ tasks:
build:
cmds:
- docker compose -f ./modules/odr_caption/docker/caption.docker-compose.yml build
rebuild:
cmds:
- docker compose -f ./modules/odr_caption/docker/caption.docker-compose.yml up --build

run:
cmds:
- docker compose -f ./modules/odr_caption/docker/caption.docker-compose.yml up

qwen7b:
cmds:
- ODR_VISION_MODEL=unsloth/Qwen2-VL-7B-Instruct docker compose -f ./modules/odr_caption/docker/caption.docker-compose.yml up

qwen2b:
cmds:
- ODR_VISION_MODEL=Qwen/Qwen2-VL-2B-Instruct docker compose -f ./modules/odr_caption/docker/caption.docker-compose.yml up
watch:
cmds:
- docker compose -f ./modules/odr_caption/docker/caption.docker-compose.yml watch
27 changes: 13 additions & 14 deletions modules/odr_caption/docker/Dockerfile.caption
@@ -1,31 +1,30 @@
FROM vllm/vllm-openai:v0.6.4.post1

# Create a non-root user
RUN groupadd -r appuser && useradd -r -g appuser appuser
# Create a non-root user and set up directories
RUN useradd -m -s /bin/bash appuser \
&& mkdir -p /cache/HF_HOME /cache/local_cache /app /vllm-workspace /cache/HF_HOME/.hub /cache/HF_HOME/hub/.locks \
&& chmod -R 777 /cache \
&& chown -R appuser:appuser /cache /app /vllm-workspace \
&& chmod -R 775 /app /vllm-workspace

ENV VLLM_VERSION=v0.6.4.post1
ENV DO_NOT_TRACK=1
ENV HF_HOME=/cache/HF_HOME

# Install dependencies
# Install dependencies with read-only permissions
COPY ./requirements.txt /app/requirements.txt

# Copy application code
COPY . /app
# Copy application code with read-only permissions
COPY . /app
RUN pip3 install --no-cache-dir -e /app \
&& pip3 install git+https://github.com/dottxt-ai/outlines --upgrade \
&& chown -R appuser:appuser /app \
&& mkdir -p /vllm-workspace \
&& chown -R appuser:appuser /vllm-workspace
&& pip3 install git+https://github.com/dottxt-ai/outlines --upgrade

# Set Python path to include the modules
ENV PYTHONPATH=/app:/app/modules:$PYTHONPATH
# Copy and set permissions for entrypoint script (executable but not writable)
COPY --chmod=555 --chown=appuser:appuser endpoints-entrypoint.sh entrypoint.sh

# Expose the port the app runs on
EXPOSE 32100

COPY --chmod=775 endpoints-entrypoint.sh entrypoint.sh
RUN chown appuser:appuser entrypoint.sh

# Switch to non-root user
USER appuser

9 changes: 6 additions & 3 deletions modules/odr_caption/docker/caption.docker-compose.yml
@@ -5,6 +5,7 @@ services:
build:
context: ..
dockerfile: ./docker/Dockerfile.caption
user: "${UID:-1000}:${GID:-1000}"
ports:
- 32100:32100
deploy:
@@ -15,10 +16,12 @@
count: all
capabilities: [gpu]
volumes:
- ${HF_HOME:-~/.cache}:/models
- ../local/cache:/root/.cache/
- ${HF_HOME:-~/.cache}:/cache/HF_HOME
- ../local/cache:/cache/local_cache
environment:
- HF_HOME=/models
- HF_HOME=/cache/HF_HOME
- ODR_TEXT_MODEL=${ODR_TEXT_MODEL:-Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8}
- ODR_VISION_MODEL=${ODR_VISION_MODEL:-mistral-community/pixtral-12b}
develop:
watch:
- action: sync
27 changes: 27 additions & 0 deletions modules/odr_caption/odr_caption/models/get_model.py
@@ -0,0 +1,27 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM
from vllm.sampling_params import SamplingParams
import outlines
import os


def get_text_model(context=8096):
model_name = os.getenv("ODR_TEXT_MODEL", "Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8")
llm = LLM(
model=model_name,
max_model_len=8096,
dtype="bfloat16",
gpu_memory_utilization=0.9,
trust_remote_code=True,
)

model = outlines.models.VLLM(llm)
return model


def get_default_params() -> SamplingParams:
return SamplingParams(
temperature=0.7,
min_p=0.9,
max_tokens=4096,
)
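
For context, a minimal sketch of how these new helpers might be wired together with outlines. This is not part of the diff; the prompt text and the `sampling_params` pass-through are assumptions about the caller, not code in this PR.

```python
# Illustrative sketch only (not part of this diff).
import outlines
from odr_caption.models.get_model import get_text_model, get_default_params

model = get_text_model()        # vLLM-backed model named by ODR_TEXT_MODEL
params = get_default_params()   # vLLM SamplingParams (temperature=0.7, min_p=0.9, max_tokens=4096)

generator = outlines.generate.text(model)
# Assumes the outlines vLLM adapter forwards the `sampling_params` keyword
# to vLLM; drop the kwarg if your outlines version handles it differently.
result = generator("Summarize this dataset in one sentence.", sampling_params=params)
print(result)
```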
97 changes: 97 additions & 0 deletions modules/odr_caption/odr_caption/models/get_vision_model.py
@@ -0,0 +1,97 @@
# SPDX-License-Identifier: Apache-2.0
import torch
from transformers import (
LlavaForConditionalGeneration,
Qwen2VLForConditionalGeneration,
PreTrainedModel,
AutoProcessor,
)
import outlines
import os
from outlines.models.transformers_vision import TransformersVision
from typing import Type
from odr_caption.utils.logger import logger


class VisionModel:
def __init__(self, model_name: str, model_class: Type[PreTrainedModel]):
self.model_name = model_name
self.model_class = model_class
self.processor = AutoProcessor.from_pretrained(model_name)
self.model = self.setup_model()

def setup_model(self):
if "AWQ" in self.model_name:
torch_dtype = torch.float16
else:
torch_dtype = torch.bfloat16
model_kwargs = {
"torch_dtype": torch_dtype,
"device_map": "auto",
}
processor_kwargs = {
"device": "cuda",
}
model = outlines.models.transformers_vision(
self.model_name,
model_class=self.model_class,
model_kwargs=model_kwargs,
processor_kwargs=processor_kwargs,
)
return model

def format_instruction(self, instruction: str, images: list):
logger.info(f"Formatting instruction: {instruction}")
logger.info(f"Images: {images}")
return self.processor.apply_chat_template(
[
{
"role": "user",
"content": [{"type": "text", "text": instruction}]
+ [{"type": "image", "image": ""} for image in images],
}
],
add_generation_prompt=True,
)


class PixtralVisionModel(VisionModel):
img_token = "[IMG]"
pixtral_format = """
<s>[INST]
{instruction}
\n{img_tokens}[/INST]
"""

def __init__(
self,
model_name: str = "unsloth/Pixtral-12B-2409",
model_class: Type[PreTrainedModel] = LlavaForConditionalGeneration,
):
super().__init__(model_name, model_class)

def format_instruction(self, instruction: str, images: list):
img_tokens = [self.img_token for _ in images]
return self.pixtral_format.format(
instruction=instruction, img_tokens=img_tokens
)


def get_vision_model() -> tuple[TransformersVision, AutoProcessor]:
model_name = os.getenv("ODR_VISION_MODEL", "unsloth/Pixtral-12B-2409")
logger.info(f"Using model: {model_name}")
# Select appropriate model class based on model name
if "Qwen" in model_name:
logger.info("Using Qwen2VLForConditionalGeneration")
model_class = Qwen2VLForConditionalGeneration
model = VisionModel(model_name, model_class)
elif "Llama" in model_name:
logger.info("Using MllamaForConditionalGeneration")
logger.error("MllamaForConditionalGeneration is not yet supported")
raise NotImplementedError("MllamaForConditionalGeneration is not yet supported")
else:
logger.info("Using LlavaForConditionalGeneration")
model_class = LlavaForConditionalGeneration
model = PixtralVisionModel(model_name, model_class)

return model
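
A minimal caller sketch for the new factory, mirroring how StructuredCaption uses it. Illustrative only; the image path and instruction text are placeholders, not values from this PR.

```python
# Illustrative sketch only (not part of this diff).
from PIL import Image
import outlines
from odr_caption.models.get_vision_model import get_vision_model
from odr_caption.schemas.caption import ImageData

vision = get_vision_model()        # Pixtral by default; Qwen2-VL when ODR_VISION_MODEL selects it
image = Image.open("example.jpg")  # placeholder path
prompt = vision.format_instruction("Describe this image.", [image])

# transformers_vision generators take (prompt, images)
generator = outlines.generate.json(vision.model, ImageData)
result: ImageData = generator(prompt, [image])
print(result)
```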
19 changes: 9 additions & 10 deletions modules/odr_caption/odr_caption/outlines/StructuredCaption.py
@@ -5,35 +5,32 @@
import outlines.samplers
from odr_caption.utils.logger import logger

from odr_caption.outlines.get_model import (
from odr_caption.models.get_vision_model import (
VisionModel,
get_vision_model,
default_vision_model,
)
from odr_caption.outlines.caption import pixtral_instruction
from outlines.generate.api import SamplingParameters
from odr_caption.outlines.caption import instruction
from odr_caption.schemas.caption import ImageData


class StructuredCaption:
def __init__(self, model_config: VisionModel = default_vision_model):
def __init__(self):
start_total = time.time()
self.model_config = model_config
logger.info(
f"Initializing StructuredCaption with model: {model_config.model_name}"
"Initializing StructuredCaption"
)

# Time model loading
start_model = time.time()
self.model = get_vision_model(model_config)
self.model: VisionModel = get_vision_model()
model_time = time.time() - start_model
logger.info(f"Model initialization completed in {model_time:.2f} seconds")

# Time decoder generation
start_decoder = time.time()
sampler = outlines.samplers.multinomial(temperature=0.5)
logger.info("Generating decoder...")
self.generator = outlines.generate.json(self.model, ImageData, sampler=sampler)
self.generator = outlines.generate.json(self.model.model, ImageData, sampler=sampler)
decoder_time = time.time() - start_decoder
logger.info(f"Decoder generation completed in {decoder_time:.2f} seconds")

@@ -42,10 +39,12 @@ def __init__(self, model_config: VisionModel = default_vision_model):
logger.debug("StructuredCaption initialized successfully")

def __call__(
self, image: Image.Image, instruction: str = pixtral_instruction, **kwargs
self, image: Image.Image, instruction: str = instruction, **kwargs
) -> ImageData:
logger.info("Generating caption for image")

instruction = self.model.format_instruction(instruction, [image])
logger.debug(f"Instruction: {instruction}")
try:
result = self.generator(
instruction,
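
With the `model_config` parameter gone, a caller sketch now looks roughly like this. Illustrative only; the image path is a placeholder and ImageData is assumed to be a pydantic model.

```python
# Illustrative sketch only (not part of this diff).
from PIL import Image
from odr_caption.outlines.StructuredCaption import StructuredCaption

captioner = StructuredCaption()              # model chosen via ODR_VISION_MODEL
data = captioner(Image.open("example.jpg"))  # returns an ImageData instance
print(data.model_dump_json(indent=2))        # assumes ImageData is a pydantic model
```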
30 changes: 3 additions & 27 deletions modules/odr_caption/odr_caption/outlines/caption.py
@@ -8,6 +8,7 @@
import os
from outlines.generate.api import GenerationParameters, SamplingParameters
from tqdm import tqdm
from transformers import AutoProcessor


class CustomEncoder(json.JSONEncoder):
@@ -18,31 +19,7 @@ def default(self, obj):
return super().default(obj)


def process_images_and_save(
images: List[Image.Image], instruction: str, output_file: str, generator
):
results = []

for filename, image in images.items():
result = generator(instruction, [image])
result_dict = {filename: result.model_dump()}
results.append(result_dict)

os.makedirs(Path(output_file).parent, exist_ok=True)
# Save the current results to the JSON file
with open(output_file, "w", encoding="utf-8") as f:
json.dump(results, f, indent=2, ensure_ascii=False, cls=CustomEncoder)

print(f"Processed image {filename} and saved results")
print(result_dict)

print(f"All results saved to {output_file}")
return results


pixtral_instruction = """
<s>[INST]
<Task>You are a structured image analysis agent. Generate comprehensive tag list, caption, and dense caption for an image classification system.</Task>
instruction = """<Task>You are a structured image analysis agent. Generate comprehensive tag list, caption, and dense caption for an image classification system.</Task>
<TagCategories requirement="You should generate a minimum of 1 tag for each category." confidence="Confidence score for the tag, between 0 (exclusive) and 1 (inclusive).">
- Entity : The content of the image, including the objects, people, and other elements.
- Relationship : The relationships between the entities in the image.
@@ -76,5 +53,4 @@ def process_images_and_save(
</TagCategories>
<ShortCaption note="The short caption is a concise single sentence caption of the image content with a maximum length of 100 characters.">
<Verification note="The verification identifies issues with the extracted tags and simple caption where the tags do not match the visual content you can actually see. Be a critic.">
<DenseCaption note="The dense caption is a descriptive but grounded narrative paragraph of the image content. Only reference items you are confident you can see in the image.It uses straightforward confident and clear language without overt flowery prose. It incorporates elements from each of the tag categories to provide a broad dense caption">\n[IMG][/INST]
""".strip()
<DenseCaption note="The dense caption is a descriptive but grounded narrative paragraph of the image content. Only reference items you are confident you can see in the image.It uses straightforward confident and clear language without overt flowery prose. It incorporates elements from each of the tag categories to provide a broad dense caption">"""
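
The prompt constant is now model-agnostic: the Pixtral-specific `<s>[INST] ... [IMG][/INST]` framing moved out of the string and into VisionModel.format_instruction, which applies the Pixtral template or the processor's chat template for Qwen2-VL. A rough sketch of that wrapping step, illustrative only:

```python
# Illustrative sketch only (not part of this diff).
from PIL import Image
from odr_caption.outlines.caption import instruction
from odr_caption.models.get_vision_model import get_vision_model

vision = get_vision_model()
image = Image.open("example.jpg")  # placeholder path
# For Pixtral this produces "<s>[INST] {instruction} [IMG][/INST]"-style text;
# for Qwen2-VL it goes through processor.apply_chat_template instead.
prompt = vision.format_instruction(instruction, [image])
```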
68 changes: 0 additions & 68 deletions modules/odr_caption/odr_caption/outlines/get_model.py

This file was deleted.
