From 78905b95b81987be63ca302e4e8a4a15191d964e Mon Sep 17 00:00:00 2001
From: Aleksandr Movchan
Date: Tue, 12 Dec 2023 13:40:15 +0000
Subject: [PATCH] Bug fixes

---
 aana/api/api_generation.py          | 13 +++----------
 aana/api/app.py                     |  4 +++-
 aana/configs/deployments.py         | 12 ++++++------
 aana/deployments/vllm_deployment.py | 15 ++++++++++-----
 aana/utils/video.py                 |  4 ++--
 mobius-pipeline                     |  2 +-
 6 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/aana/api/api_generation.py b/aana/api/api_generation.py
index 1168a51b..edb7e8c7 100644
--- a/aana/api/api_generation.py
+++ b/aana/api/api_generation.py
@@ -6,6 +6,7 @@
 
 from fastapi import FastAPI, File, Form, UploadFile
 from fastapi.responses import StreamingResponse
+from mobius_pipeline.exceptions import BaseException
 from mobius_pipeline.node.socket import Socket
 from mobius_pipeline.pipeline.pipeline import Pipeline
 from pydantic import BaseModel, Field, ValidationError, create_model, parse_raw_as
@@ -345,10 +346,6 @@ async def route_func_body(body: str, files: list[UploadFile] | None = None):  #
             data_dict = {}
             for field_name in data.__fields__:
                 field_value = getattr(data, field_name)
-                # check if it has a method convert_to_entities
-                # if it does, call it to convert the model to an entity
-                # if hasattr(field_value, "convert_input_to_object"):
-                #     field_value = field_value.convert_input_to_object()
                 data_dict[field_name] = field_value
 
             if self.output_filter:
@@ -393,14 +390,10 @@ async def generator_wrapper() -> AsyncGenerator[bytes, None]:
                         output = self.process_output(output)
                         yield AanaJSONResponse(content=output).body
                 except RayTaskError as e:
-                    print(f"Got exception: {e} Type: {type(e)}")
                     yield custom_exception_handler(None, e).body
-                # except BaseException as e:
-                #     print(f"Got exception: {e} Type: {type(e)}")
-                #     yield custom_exception_handler(None, e)
+                except BaseException as e:
+                    yield custom_exception_handler(None, e)
                 except Exception as e:
-                    print(f"Got exception: {e} Type: {type(e)}")
-                    # yield custom_exception_handler(None, e).body
                     error = e.__class__.__name__
                     stacktrace = traceback.format_exc()
                     yield AanaJSONResponse(
diff --git a/aana/api/app.py b/aana/api/app.py
index e341c1ac..dcc13a2d 100644
--- a/aana/api/app.py
+++ b/aana/api/app.py
@@ -32,7 +32,9 @@ async def validation_exception_handler(request: Request, exc: ValidationError):
     )
 
 
-def custom_exception_handler(request: Request, exc_raw: BaseException | RayTaskError):
+def custom_exception_handler(
+    request: Request | None, exc_raw: BaseException | RayTaskError
+):
     """This handler is used to handle custom exceptions raised in the application.
 
     BaseException is the base exception for all the exceptions
diff --git a/aana/configs/deployments.py b/aana/configs/deployments.py
index d388e738..4e68b6b1 100644
--- a/aana/configs/deployments.py
+++ b/aana/configs/deployments.py
@@ -13,12 +13,12 @@
     "vllm_deployment_llama2_7b_chat": VLLMDeployment.options(
         num_replicas=1,
         max_concurrent_queries=1000,
-        ray_actor_options={"num_gpus": 0.9},
+        ray_actor_options={"num_gpus": 0.25},
         user_config=VLLMConfig(
             model="TheBloke/Llama-2-7b-Chat-AWQ",
             dtype="auto",
             quantization="awq",
-            gpu_memory_utilization=0.9,
+            gpu_memory_reserved=10000,
             default_sampling_params=SamplingParams(
                 temperature=1.0, top_p=1.0, top_k=-1, max_tokens=256
             ),
@@ -28,12 +28,12 @@
     "vllm_deployment_zephyr_7b_beta": VLLMDeployment.options(
         num_replicas=1,
         max_concurrent_queries=1000,
-        ray_actor_options={"num_gpus": 0.5},
+        ray_actor_options={"num_gpus": 0.25},
         user_config=VLLMConfig(
             model="TheBloke/zephyr-7B-beta-AWQ",
             dtype="auto",
             quantization="awq",
-            gpu_memory_utilization=0.9,
+            gpu_memory_reserved=10000,
             max_model_len=512,
             default_sampling_params=SamplingParams(
                 temperature=1.0, top_p=1.0, top_k=-1, max_tokens=256
@@ -43,7 +43,7 @@
     "hf_blip2_deployment_opt_2_7b": HFBlip2Deployment.options(
         num_replicas=1,
         max_concurrent_queries=1000,
-        ray_actor_options={"num_gpus": 0.45},
+        ray_actor_options={"num_gpus": 0.25},
         user_config=HFBlip2Config(
             model="Salesforce/blip2-opt-2.7b",
             dtype=Dtype.FLOAT16,
@@ -54,7 +54,7 @@
     "whisper_deployment_medium": WhisperDeployment.options(
         num_replicas=1,
         max_concurrent_queries=1000,
-        ray_actor_options={"num_gpus": 0.45},
+        ray_actor_options={"num_gpus": 0.25},
         user_config=WhisperConfig(
             model_size=WhisperModelSize.MEDIUM,
             compute_type=WhisperComputeType.FLOAT16,
diff --git a/aana/deployments/vllm_deployment.py b/aana/deployments/vllm_deployment.py
index f531c95a..998c25aa 100644
--- a/aana/deployments/vllm_deployment.py
+++ b/aana/deployments/vllm_deployment.py
@@ -7,7 +7,7 @@
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.model_executor.utils import set_random_seed
 from vllm.sampling_params import SamplingParams as VLLMSamplingParams
-from vllm.utils import random_uuid
+from vllm.utils import get_gpu_memory, random_uuid
 
 from aana.deployments.base_deployment import BaseDeployment
 from aana.exceptions.general import InferenceException
@@ -24,7 +24,7 @@ class VLLMConfig(BaseModel):
         model (str): the model name
         dtype (str): the data type (optional, default: "auto")
         quantization (str): the quantization method (optional, default: None)
-        gpu_memory_utilization (float): the GPU memory utilization.
+        gpu_memory_reserved (float): the GPU memory reserved for the model in mb
         default_sampling_params (SamplingParams): the default sampling parameters.
         max_model_len (int): the maximum generated text length in tokens (optional, default: None)
     """
@@ -32,7 +32,7 @@ class VLLMConfig(BaseModel):
     model: str
     dtype: str | None = Field(default="auto")
    quantization: str | None = Field(default=None)
-    gpu_memory_utilization: float
+    gpu_memory_reserved: float
     default_sampling_params: SamplingParams
     max_model_len: int | None = Field(default=None)
     chat_template: str | None = Field(default=None)
@@ -83,7 +83,7 @@ async def apply_config(self, config: dict[str, Any]):
         - model: the model name
         - dtype: the data type (optional, default: "auto")
         - quantization: the quantization method (optional, default: None)
-        - gpu_memory_utilization: the GPU memory utilization.
+        - gpu_memory_reserved: the GPU memory reserved for the model in mb
         - default_sampling_params: the default sampling parameters.
         - max_model_len: the maximum generated text length in tokens (optional, default: None)
         - chat_template: the name of the chat template (optional, default: None)
@@ -93,6 +93,11 @@ async def apply_config(self, config: dict[str, Any]):
         """
         config_obj = VLLMConfig(**config)
         self.model = config_obj.model
+        total_gpu_memory_bytes = get_gpu_memory()
+        total_gpu_memory_mb = total_gpu_memory_bytes / 1024**2
+        self.gpu_memory_utilization = (
+            config_obj.gpu_memory_reserved / total_gpu_memory_mb
+        )
         self.default_sampling_params: SamplingParams = (
             config_obj.default_sampling_params
         )
@@ -101,7 +106,7 @@ async def apply_config(self, config: dict[str, Any]):
             model=config_obj.model,
             dtype=config_obj.dtype,
             quantization=config_obj.quantization,
-            gpu_memory_utilization=config_obj.gpu_memory_utilization,
+            gpu_memory_utilization=self.gpu_memory_utilization,
             max_model_len=config_obj.max_model_len,
         )
 
diff --git a/aana/utils/video.py b/aana/utils/video.py
index 60474f35..19976d6a 100644
--- a/aana/utils/video.py
+++ b/aana/utils/video.py
@@ -1,4 +1,4 @@
-import json
+import json  # noqa: I001
 import pickle
 from collections import defaultdict
 from collections.abc import Generator
@@ -7,7 +7,7 @@
 from typing import TypedDict
 
 import numpy as np
-import torch, decord  # See https://github.com/dmlc/decord/issues/263 # noqa: F401
+import torch, decord  # noqa: F401 # See https://github.com/dmlc/decord/issues/263
 import yt_dlp
 from yt_dlp.utils import DownloadError
diff --git a/mobius-pipeline b/mobius-pipeline
index 386943bd..65aa0048 160000
--- a/mobius-pipeline
+++ b/mobius-pipeline
@@ -1 +1 @@
-Subproject commit 386943bd78d8c3617013ac52bd18a92be0e19c5e
+Subproject commit 65aa004801a47036247f76d9ef058c976fdb22c1
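
For reference, the api_generation.py change routes errors raised mid-stream through the exception handler instead of printing them: RayTaskError and the Mobius Pipeline BaseException are turned into a JSON body that is yielded into the streaming response, and any other Exception falls back to an error/stacktrace payload. Below is a minimal, self-contained sketch of that pattern only; the JSON error shape and the demo driver are assumptions standing in for custom_exception_handler and AanaJSONResponse, not the actual aana code.

import asyncio
import json
import traceback
from collections.abc import AsyncGenerator, AsyncIterator


async def generator_wrapper(outputs: AsyncIterator[dict]) -> AsyncGenerator[bytes, None]:
    """Stream each pipeline output as a JSON chunk; turn exceptions into a JSON chunk too."""
    try:
        async for output in outputs:
            yield json.dumps(output).encode()
    except Exception as e:
        # Mirrors the patched fallback branch: report the error class and stacktrace
        # inside the stream instead of raising after the response has started.
        yield json.dumps(
            {
                "error": e.__class__.__name__,
                "message": str(e),
                "stacktrace": traceback.format_exc(),
            }
        ).encode()


async def _demo() -> None:
    async def outputs() -> AsyncIterator[dict]:
        yield {"text": "partial result"}
        raise RuntimeError("pipeline failed mid-stream")

    async for chunk in generator_wrapper(outputs()):
        print(chunk)


if __name__ == "__main__":
    asyncio.run(_demo())

Yielding the error as a chunk matters because the HTTP status line has already been sent once streaming starts, so raising at that point would simply truncate the response instead of telling the client what went wrong.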
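
The deployment changes replace the fractional gpu_memory_utilization setting with an absolute gpu_memory_reserved value in MB; apply_config converts it back into the fraction vLLM expects using the total GPU memory reported by vllm.utils.get_gpu_memory(). A standalone sketch of that conversion follows, assuming a 40 GB card as an example total (the real deployment queries the GPU at runtime).

def reserved_mb_to_utilization(reserved_mb: float, total_gpu_memory_bytes: int) -> float:
    """Convert an absolute reservation in MB into the fraction vLLM expects."""
    # Same arithmetic as the patched VLLMDeployment.apply_config:
    # bytes -> MB, then reserved MB divided by total MB.
    total_gpu_memory_mb = total_gpu_memory_bytes / 1024**2
    return reserved_mb / total_gpu_memory_mb


if __name__ == "__main__":
    total_bytes = 40 * 1024**3  # assumed example: a 40 GB GPU (e.g. A100 40GB)
    utilization = reserved_mb_to_utilization(10000, total_bytes)  # 10000 MB as in configs/deployments.py
    print(f"gpu_memory_utilization={utilization:.3f}")  # ~0.244 on a 40 GB card

On such a card the 10000 MB reservation used for both vLLM deployments works out to roughly 0.24 of the device, which appears consistent with every deployment now requesting ray_actor_options={"num_gpus": 0.25}, i.e. four deployments sharing one GPU.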