From f67440dbc680448a857f4e59bc5f9912f143dc2a Mon Sep 17 00:00:00 2001
From: kennymckormick
Date: Fri, 24 Jan 2025 11:59:08 +0000
Subject: [PATCH] [Fix] Fix Emu3 Inference

---
 vlmeval/vlm/emu.py | 61 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 46 insertions(+), 15 deletions(-)

diff --git a/vlmeval/vlm/emu.py b/vlmeval/vlm/emu.py
index 7ba49cfa..10041f74 100644
--- a/vlmeval/vlm/emu.py
+++ b/vlmeval/vlm/emu.py
@@ -6,6 +6,7 @@ from .base import BaseModel
 from ..smp import *
 from huggingface_hub import snapshot_download
+from PIL import Image, ImageOps
 
 
 def get_local_root(repo_id):
@@ -19,6 +20,42 @@ def get_local_root(repo_id):
     return cache_path
 
 
+def pad_image_to_aspect_ratio(img, max_aspect_ratio=5):
+    """
+    Pad an image so that its aspect ratio (width/height or height/width) does not exceed the given value.
+
+    Parameters:
+        img (PIL.Image): The input PIL Image object.
+        max_aspect_ratio (float): The maximum allowed aspect ratio.
+
+    Returns:
+        PIL.Image: The padded image.
+    """
+    width, height = img.size
+    # Calculate the required minimum dimensions to satisfy the aspect ratio constraint
+    if width > height * max_aspect_ratio:
+        # Width is too large, pad height
+        new_height = int(width / max_aspect_ratio + 1)
+        new_width = width
+    elif height > width * max_aspect_ratio:
+        # Height is too large, pad width
+        new_width = int(height / max_aspect_ratio + 1)
+        new_height = height
+    else:
+        # Aspect ratio is already less than or equal to max_aspect_ratio
+        return img
+
+    # Calculate the padding amounts
+    pad_width = (new_width - width)
+    pad_height = (new_height - height)
+
+    # Pad the image symmetrically
+    padding = (pad_width // 2, pad_height // 2, pad_width - pad_width // 2, pad_height - pad_height // 2)
+    padded_img = ImageOps.expand(img, padding, fill=(0, 0, 0))  # Fill color is black (0, 0, 0)
+
+    return padded_img
+
+
 class Emu(BaseModel):
 
     INSTALL_REQ = False
@@ -52,7 +89,6 @@ def __init__(self,
         model = AutoModelForCausalLM.from_pretrained(
             model_path,  # "BAAI/Emu2-Chat"
             torch_dtype=torch.bfloat16,
-            low_cpu_mem_usage=True,
             trust_remote_code=True)
 
         device_map = infer_auto_device_map(
@@ -103,7 +139,7 @@ def generate_inner(self, message, dataset=None):
 
 class Emu3_chat(BaseModel):
 
-    INSTALL_REQ = True
+    INSTALL_REQ = False
     INTERLEAVE = False
 
     def __init__(self, model_path='BAAI/Emu3-Chat', tokenizer_path='BAAI/Emu3-VisionTokenizer', **kwargs):
@@ -132,19 +168,15 @@ def __init__(self, model_path='BAAI/Emu3-Chat', tokenizer_path='BAAI/Emu3-Vision
             tokenizer_path, device_map='cuda', trust_remote_code=True).eval()
         self.processor = Emu3Processor(self.image_processor, self.image_tokenizer, self.tokenizer)
         self.kwargs = kwargs
-        self.cuda = cuda
 
     def generate_inner(self, message, dataset=None):
-        query, images = '', []
-        for item in message:
-            if item['type'] == 'image':
-                images.append(Image.open(item['value']).convert('RGB'))
-            elif item['type'] == 'text':
-                query += item['value']
+        prompt, image = self.message_to_promptimg(message)
+        image = Image.open(image).convert('RGB')
+        image = pad_image_to_aspect_ratio(image, 5)
 
         inputs = self.processor(
-            text=[query],
-            image=images,
+            text=[prompt],
+            image=[image],
             mode='U',
             return_tensors="pt",
             padding="longest",
@@ -159,9 +191,9 @@ def generate_inner(self, message, dataset=None):
         )
         # generate
         outputs = self.model.generate(
-            inputs.input_ids.to(self.cuda),
+            inputs.input_ids.to('cuda'),
             GENERATION_CONFIG,
-            attention_mask=inputs.attention_mask.to(self.cuda),
+            attention_mask=inputs.attention_mask.to('cuda'),
         )
         outputs = outputs[:, inputs.input_ids.shape[-1]:]
@@ -170,7 +202,7 @@ def generate_inner(self, message, dataset=None):
 
 class Emu3_gen(BaseModel):
 
-    INSTALL_REQ = True
+    INSTALL_REQ = False
     INTERLEAVE = False
 
     def __init__(self,
@@ -207,7 +239,6 @@ def __init__(self,
             trust_remote_code=True).eval()
         self.processor = Emu3Processor(self.image_processor, self.image_tokenizer, self.tokenizer)
         self.kwargs = kwargs
-        self.cuda = cuda
         self.output_path = output_path
 
     def generate_inner(self, message, dataset=None):
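
Usage sketch for the new pad_image_to_aspect_ratio helper: a minimal example, assuming the patch is applied, vlmeval is importable, and Pillow is installed. The image sizes below are illustrative, not taken from the patch.

    from PIL import Image

    from vlmeval.vlm.emu import pad_image_to_aspect_ratio

    # A 1000x100 image has aspect ratio 10, above the cap of 5, so the helper
    # pads the height with black bars: new height = int(1000 / 5 + 1) = 201.
    wide = Image.new('RGB', (1000, 100))
    padded = pad_image_to_aspect_ratio(wide, max_aspect_ratio=5)
    assert padded.size == (1000, 201)
    assert padded.width / padded.height <= 5

    # An image already within the cap is returned unchanged.
    square = Image.new('RGB', (640, 480))
    assert pad_image_to_aspect_ratio(square, 5) is square

This is why generate_inner now calls the helper before handing the image to Emu3Processor: extreme aspect ratios are padded into range rather than failing or distorting at preprocessing time.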