Caption Container Optimizations & Qwen Support #118

Merged · 5 commits · Dec 5, 2024
11 changes: 11 additions & 0 deletions modules/odr_caption/Taskfile.caption.yml
@@ -16,10 +16,21 @@ tasks:
build:
cmds:
- docker compose -f ./modules/odr_caption/docker/caption.docker-compose.yml build
rebuild:
cmds:
- docker compose -f ./modules/odr_caption/docker/caption.docker-compose.yml up --build

run:
cmds:
- docker compose -f ./modules/odr_caption/docker/caption.docker-compose.yml up

qwen7b:
cmds:
- ODR_VISION_MODEL=unsloth/Qwen2-VL-7B-Instruct docker compose -f ./modules/odr_caption/docker/caption.docker-compose.yml up

qwen2b:
cmds:
- ODR_VISION_MODEL=Qwen/Qwen2-VL-2B-Instruct docker compose -f ./modules/odr_caption/docker/caption.docker-compose.yml up
watch:
cmds:
- docker compose -f ./modules/odr_caption/docker/caption.docker-compose.yml watch
27 changes: 13 additions & 14 deletions modules/odr_caption/docker/Dockerfile.caption
@@ -1,31 +1,30 @@
FROM vllm/vllm-openai:v0.6.4.post1

# Create a non-root user
RUN groupadd -r appuser && useradd -r -g appuser appuser
# Create a non-root user and set up directories
RUN useradd -m -s /bin/bash appuser \
&& mkdir -p /cache/HF_HOME /cache/local_cache /app /vllm-workspace /cache/HF_HOME/.hub /cache/HF_HOME/hub/.locks \
&& chmod -R 777 /cache \
&& chown -R appuser:appuser /cache /app /vllm-workspace \
&& chmod -R 775 /app /vllm-workspace

ENV VLLM_VERSION=v0.6.4.post1
ENV DO_NOT_TRACK=1
ENV HF_HOME=/cache/HF_HOME

# Install dependencies
# Install dependencies with read-only permissions
COPY ./requirements.txt /app/requirements.txt

# Copy application code
COPY . /app
# Copy application code with read-only permissions
COPY . /app
RUN pip3 install --no-cache-dir -e /app \
&& pip3 install git+https://github.com/dottxt-ai/outlines --upgrade \
&& chown -R appuser:appuser /app \
&& mkdir -p /vllm-workspace \
&& chown -R appuser:appuser /vllm-workspace
&& pip3 install git+https://github.com/dottxt-ai/outlines --upgrade

# Set Python path to include the modules
ENV PYTHONPATH=/app:/app/modules:$PYTHONPATH
# Copy and set permissions for entrypoint script (executable but not writable)
COPY --chmod=555 --chown=appuser:appuser endpoints-entrypoint.sh entrypoint.sh

# Expose the port the app runs on
EXPOSE 32100

COPY --chmod=775 endpoints-entrypoint.sh entrypoint.sh
RUN chown appuser:appuser entrypoint.sh

# Switch to non-root user
USER appuser

9 changes: 6 additions & 3 deletions modules/odr_caption/docker/caption.docker-compose.yml
@@ -5,6 +5,7 @@ services:
build:
context: ..
dockerfile: ./docker/Dockerfile.caption
user: "${UID:-1000}:${GID:-1000}"
ports:
- 32100:32100
deploy:
@@ -15,10 +16,12 @@
count: all
capabilities: [gpu]
volumes:
- ${HF_HOME:-~/.cache}:/models
- ../local/cache:/root/.cache/
- ${HF_HOME:-~/.cache}:/cache/HF_HOME
- ../local/cache:/cache/local_cache
environment:
- HF_HOME=/models
- HF_HOME=/cache/HF_HOME
- ODR_TEXT_MODEL=${ODR_TEXT_MODEL:-Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8}
- ODR_VISION_MODEL=${ODR_VISION_MODEL:-mistral-community/pixtral-12b}
develop:
watch:
- action: sync
27 changes: 27 additions & 0 deletions modules/odr_caption/odr_caption/models/get_model.py
@@ -0,0 +1,27 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM
from vllm.sampling_params import SamplingParams
import outlines
import os


def get_text_model(context=8096):
model_name = os.getenv("ODR_TEXT_MODEL", "Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8")
llm = LLM(
model=model_name,
max_model_len=8096,
dtype="bfloat16",
gpu_memory_utilization=0.9,
trust_remote_code=True,
)

model = outlines.models.VLLM(llm)
return model


def get_default_params() -> SamplingParams:
return SamplingParams(
temperature=0.7,
min_p=0.9,
max_tokens=4096,
)
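
For context, a minimal sketch of how these new helpers might be wired together with outlines. This is not part of the diff; the prompt text and the `sampling_params` pass-through are assumptions about the caller, not code in this PR.

```python
# Illustrative sketch only (not part of this diff).
import outlines
from odr_caption.models.get_model import get_text_model, get_default_params

model = get_text_model()        # vLLM-backed model named by ODR_TEXT_MODEL
params = get_default_params()   # vLLM SamplingParams (temperature=0.7, min_p=0.9, max_tokens=4096)

generator = outlines.generate.text(model)
# Assumes the outlines vLLM adapter forwards the `sampling_params` keyword
# to vLLM; drop the kwarg if your outlines version handles it differently.
result = generator("Summarize this dataset in one sentence.", sampling_params=params)
print(result)
```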
97 changes: 97 additions & 0 deletions modules/odr_caption/odr_caption/models/get_vision_model.py
@@ -0,0 +1,97 @@
# SPDX-License-Identifier: Apache-2.0
import torch
from transformers import (
LlavaForConditionalGeneration,
Qwen2VLForConditionalGeneration,
PreTrainedModel,
AutoProcessor,
)
import outlines
import os
from outlines.models.transformers_vision import TransformersVision
from typing import Type
from odr_caption.utils.logger import logger


class VisionModel:
def __init__(self, model_name: str, model_class: Type[PreTrainedModel]):
self.model_name = model_name
self.model_class = model_class
self.processor = AutoProcessor.from_pretrained(model_name)
self.model = self.setup_model()

def setup_model(self):
if "AWQ" in self.model_name:
torch_dtype = torch.float16
else:
torch_dtype = torch.bfloat16
model_kwargs = {
"torch_dtype": torch_dtype,
"device_map": "auto",
}
processor_kwargs = {
"device": "cuda",
}
model = outlines.models.transformers_vision(
self.model_name,
model_class=self.model_class,
model_kwargs=model_kwargs,
processor_kwargs=processor_kwargs,
)
return model

def format_instruction(self, instruction: str, images: list):
logger.info(f"Formatting instruction: {instruction}")
logger.info(f"Images: {images}")
return self.processor.apply_chat_template(
[
{
"role": "user",
"content": [{"type": "text", "text": instruction}]
+ [{"type": "image", "image": ""} for image in images],
}
],
add_generation_prompt=True,
)


class PixtralVisionModel(VisionModel):
img_token = "[IMG]"
pixtral_format = """
<s>[INST]
{instruction}
\n{img_tokens}[/INST]
"""

def __init__(
self,
model_name: str = "unsloth/Pixtral-12B-2409",
model_class: Type[PreTrainedModel] = LlavaForConditionalGeneration,
):
super().__init__(model_name, model_class)

def format_instruction(self, instruction: str, images: list):
img_tokens = [self.img_token for _ in images]
return self.pixtral_format.format(
instruction=instruction, img_tokens=img_tokens
)


def get_vision_model() -> tuple[TransformersVision, AutoProcessor]:
model_name = os.getenv("ODR_VISION_MODEL", "unsloth/Pixtral-12B-2409")
logger.info(f"Using model: {model_name}")
# Select appropriate model class based on model name
if "Qwen" in model_name:
logger.info("Using Qwen2VLForConditionalGeneration")
model_class = Qwen2VLForConditionalGeneration
model = VisionModel(model_name, model_class)
elif "Llama" in model_name:
logger.info("Using MllamaForConditionalGeneration")
logger.error("MllamaForConditionalGeneration is not yet supported")
raise NotImplementedError("MllamaForConditionalGeneration is not yet supported")
else:
logger.info("Using LlavaForConditionalGeneration")
model_class = LlavaForConditionalGeneration
model = PixtralVisionModel(model_name, model_class)

return model
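
A minimal caller sketch for the new factory, mirroring how StructuredCaption uses it. Illustrative only; the image path and instruction text are placeholders, not values from this PR.

```python
# Illustrative sketch only (not part of this diff).
from PIL import Image
import outlines
from odr_caption.models.get_vision_model import get_vision_model
from odr_caption.schemas.caption import ImageData

vision = get_vision_model()        # Pixtral by default; Qwen2-VL when ODR_VISION_MODEL selects it
image = Image.open("example.jpg")  # placeholder path
prompt = vision.format_instruction("Describe this image.", [image])

# transformers_vision generators take (prompt, images)
generator = outlines.generate.json(vision.model, ImageData)
result: ImageData = generator(prompt, [image])
print(result)
```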
19 changes: 9 additions & 10 deletions modules/odr_caption/odr_caption/outlines/StructuredCaption.py
@@ -5,35 +5,32 @@
import outlines.samplers
from odr_caption.utils.logger import logger

from odr_caption.outlines.get_model import (
from odr_caption.models.get_vision_model import (
VisionModel,
get_vision_model,
default_vision_model,
)
from odr_caption.outlines.caption import pixtral_instruction
from outlines.generate.api import SamplingParameters
from odr_caption.outlines.caption import instruction
from odr_caption.schemas.caption import ImageData


class StructuredCaption:
def __init__(self, model_config: VisionModel = default_vision_model):
def __init__(self):
start_total = time.time()
self.model_config = model_config
logger.info(
f"Initializing StructuredCaption with model: {model_config.model_name}"
"Initializing StructuredCaption"
)

# Time model loading
start_model = time.time()
self.model = get_vision_model(model_config)
self.model: VisionModel = get_vision_model()
model_time = time.time() - start_model
logger.info(f"Model initialization completed in {model_time:.2f} seconds")

# Time decoder generation
start_decoder = time.time()
sampler = outlines.samplers.multinomial(temperature=0.5)
logger.info("Generating decoder...")
self.generator = outlines.generate.json(self.model, ImageData, sampler=sampler)
self.generator = outlines.generate.json(self.model.model, ImageData, sampler=sampler)
decoder_time = time.time() - start_decoder
logger.info(f"Decoder generation completed in {decoder_time:.2f} seconds")

@@ -42,10 +39,12 @@ def __init__(self, model_config: VisionModel = default_vision_model):
logger.debug("StructuredCaption initialized successfully")

def __call__(
self, image: Image.Image, instruction: str = pixtral_instruction, **kwargs
self, image: Image.Image, instruction: str = instruction, **kwargs
) -> ImageData:
logger.info("Generating caption for image")

instruction = self.model.format_instruction(instruction, [image])
logger.debug(f"Instruction: {instruction}")
try:
result = self.generator(
instruction,
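
With the `model_config` parameter gone, a caller sketch now looks roughly like this. Illustrative only; the image path is a placeholder and ImageData is assumed to be a pydantic model.

```python
# Illustrative sketch only (not part of this diff).
from PIL import Image
from odr_caption.outlines.StructuredCaption import StructuredCaption

captioner = StructuredCaption()              # model chosen via ODR_VISION_MODEL
data = captioner(Image.open("example.jpg"))  # returns an ImageData instance
print(data.model_dump_json(indent=2))        # assumes ImageData is a pydantic model
```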
30 changes: 3 additions & 27 deletions modules/odr_caption/odr_caption/outlines/caption.py
@@ -8,6 +8,7 @@
import os
from outlines.generate.api import GenerationParameters, SamplingParameters
from tqdm import tqdm
from transformers import AutoProcessor


class CustomEncoder(json.JSONEncoder):
@@ -18,31 +19,7 @@ def default(self, obj):
return super().default(obj)


def process_images_and_save(
images: List[Image.Image], instruction: str, output_file: str, generator
):
results = []

for filename, image in images.items():
result = generator(instruction, [image])
result_dict = {filename: result.model_dump()}
results.append(result_dict)

os.makedirs(Path(output_file).parent, exist_ok=True)
# Save the current results to the JSON file
with open(output_file, "w", encoding="utf-8") as f:
json.dump(results, f, indent=2, ensure_ascii=False, cls=CustomEncoder)

print(f"Processed image {filename} and saved results")
print(result_dict)

print(f"All results saved to {output_file}")
return results


pixtral_instruction = """
<s>[INST]
<Task>You are a structured image analysis agent. Generate comprehensive tag list, caption, and dense caption for an image classification system.</Task>
instruction = """<Task>You are a structured image analysis agent. Generate comprehensive tag list, caption, and dense caption for an image classification system.</Task>
<TagCategories requirement="You should generate a minimum of 1 tag for each category." confidence="Confidence score for the tag, between 0 (exclusive) and 1 (inclusive).">
- Entity : The content of the image, including the objects, people, and other elements.
- Relationship : The relationships between the entities in the image.
@@ -76,5 +53,4 @@ def process_images_and_save(
</TagCategories>
<ShortCaption note="The short caption is a concise single sentence caption of the image content with a maximum length of 100 characters.">
<Verification note="The verification identifies issues with the extracted tags and simple caption where the tags do not match the visual content you can actually see. Be a critic.">
<DenseCaption note="The dense caption is a descriptive but grounded narrative paragraph of the image content. Only reference items you are confident you can see in the image.It uses straightforward confident and clear language without overt flowery prose. It incorporates elements from each of the tag categories to provide a broad dense caption">\n[IMG][/INST]
""".strip()
<DenseCaption note="The dense caption is a descriptive but grounded narrative paragraph of the image content. Only reference items you are confident you can see in the image.It uses straightforward confident and clear language without overt flowery prose. It incorporates elements from each of the tag categories to provide a broad dense caption">"""
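
The prompt constant is now model-agnostic: the Pixtral-specific `<s>[INST] ... [IMG][/INST]` framing moved out of the string and into VisionModel.format_instruction, which applies the Pixtral template or the processor's chat template for Qwen2-VL. A rough sketch of that wrapping step, illustrative only:

```python
# Illustrative sketch only (not part of this diff).
from PIL import Image
from odr_caption.outlines.caption import instruction
from odr_caption.models.get_vision_model import get_vision_model

vision = get_vision_model()
image = Image.open("example.jpg")  # placeholder path
# For Pixtral this produces "<s>[INST] {instruction} [IMG][/INST]"-style text;
# for Qwen2-VL it goes through processor.apply_chat_template instead.
prompt = vision.format_instruction(instruction, [image])
```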
68 changes: 0 additions & 68 deletions modules/odr_caption/odr_caption/outlines/get_model.py

This file was deleted.
