From 78905b95b81987be63ca302e4e8a4a15191d964e Mon Sep 17 00:00:00 2001
From: Aleksandr Movchan
Date: Tue, 12 Dec 2023 13:40:15 +0000
Subject: [PATCH] Bug fixes

---
 aana/api/api_generation.py          | 13 +++----------
 aana/api/app.py                     |  4 +++-
 aana/configs/deployments.py         | 12 ++++++------
 aana/deployments/vllm_deployment.py | 15 ++++++++++-----
 aana/utils/video.py                 |  4 ++--
 mobius-pipeline                     |  2 +-
 6 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/aana/api/api_generation.py b/aana/api/api_generation.py
index 1168a51b..edb7e8c7 100644
--- a/aana/api/api_generation.py
+++ b/aana/api/api_generation.py
@@ -6,6 +6,7 @@
 
 from fastapi import FastAPI, File, Form, UploadFile
 from fastapi.responses import StreamingResponse
+from mobius_pipeline.exceptions import BaseException
 from mobius_pipeline.node.socket import Socket
 from mobius_pipeline.pipeline.pipeline import Pipeline
 from pydantic import BaseModel, Field, ValidationError, create_model, parse_raw_as
@@ -345,10 +346,6 @@ async def route_func_body(body: str, files: list[UploadFile] | None = None):  #
             data_dict = {}
             for field_name in data.__fields__:
                 field_value = getattr(data, field_name)
-                # check if it has a method convert_to_entities
-                # if it does, call it to convert the model to an entity
-                # if hasattr(field_value, "convert_input_to_object"):
-                #     field_value = field_value.convert_input_to_object()
                 data_dict[field_name] = field_value
 
             if self.output_filter:
@@ -393,14 +390,10 @@ async def generator_wrapper() -> AsyncGenerator[bytes, None]:
                         output = self.process_output(output)
                         yield AanaJSONResponse(content=output).body
                 except RayTaskError as e:
-                    print(f"Got exception: {e} Type: {type(e)}")
                     yield custom_exception_handler(None, e).body
-                # except BaseException as e:
-                #     print(f"Got exception: {e} Type: {type(e)}")
-                #     yield custom_exception_handler(None, e)
+                except BaseException as e:
+                    yield custom_exception_handler(None, e)
                 except Exception as e:
-                    print(f"Got exception: {e} Type: {type(e)}")
-                    # yield custom_exception_handler(None, e).body
                     error = e.__class__.__name__
                     stacktrace = traceback.format_exc()
                     yield AanaJSONResponse(
diff --git a/aana/api/app.py b/aana/api/app.py
index e341c1ac..dcc13a2d 100644
--- a/aana/api/app.py
+++ b/aana/api/app.py
@@ -32,7 +32,9 @@ async def validation_exception_handler(request: Request, exc: ValidationError):
     )
 
 
-def custom_exception_handler(request: Request, exc_raw: BaseException | RayTaskError):
+def custom_exception_handler(
+    request: Request | None, exc_raw: BaseException | RayTaskError
+):
     """This handler is used to handle custom exceptions raised in the application.
 
     BaseException is the base exception for all the exceptions
diff --git a/aana/configs/deployments.py b/aana/configs/deployments.py
index d388e738..4e68b6b1 100644
--- a/aana/configs/deployments.py
+++ b/aana/configs/deployments.py
@@ -13,12 +13,12 @@
     "vllm_deployment_llama2_7b_chat": VLLMDeployment.options(
         num_replicas=1,
         max_concurrent_queries=1000,
-        ray_actor_options={"num_gpus": 0.9},
+        ray_actor_options={"num_gpus": 0.25},
         user_config=VLLMConfig(
             model="TheBloke/Llama-2-7b-Chat-AWQ",
             dtype="auto",
             quantization="awq",
-            gpu_memory_utilization=0.9,
+            gpu_memory_reserved=10000,
             default_sampling_params=SamplingParams(
                 temperature=1.0, top_p=1.0, top_k=-1, max_tokens=256
             ),
@@ -28,12 +28,12 @@
     "vllm_deployment_zephyr_7b_beta": VLLMDeployment.options(
         num_replicas=1,
         max_concurrent_queries=1000,
-        ray_actor_options={"num_gpus": 0.5},
+        ray_actor_options={"num_gpus": 0.25},
         user_config=VLLMConfig(
             model="TheBloke/zephyr-7B-beta-AWQ",
             dtype="auto",
             quantization="awq",
-            gpu_memory_utilization=0.9,
+            gpu_memory_reserved=10000,
             max_model_len=512,
             default_sampling_params=SamplingParams(
                 temperature=1.0, top_p=1.0, top_k=-1, max_tokens=256
@@ -43,7 +43,7 @@
     "hf_blip2_deployment_opt_2_7b": HFBlip2Deployment.options(
         num_replicas=1,
         max_concurrent_queries=1000,
-        ray_actor_options={"num_gpus": 0.45},
+        ray_actor_options={"num_gpus": 0.25},
         user_config=HFBlip2Config(
             model="Salesforce/blip2-opt-2.7b",
             dtype=Dtype.FLOAT16,
@@ -54,7 +54,7 @@
     "whisper_deployment_medium": WhisperDeployment.options(
         num_replicas=1,
         max_concurrent_queries=1000,
-        ray_actor_options={"num_gpus": 0.45},
+        ray_actor_options={"num_gpus": 0.25},
         user_config=WhisperConfig(
             model_size=WhisperModelSize.MEDIUM,
             compute_type=WhisperComputeType.FLOAT16,
diff --git a/aana/deployments/vllm_deployment.py b/aana/deployments/vllm_deployment.py
index f531c95a..998c25aa 100644
--- a/aana/deployments/vllm_deployment.py
+++ b/aana/deployments/vllm_deployment.py
@@ -7,7 +7,7 @@
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.model_executor.utils import set_random_seed
 from vllm.sampling_params import SamplingParams as VLLMSamplingParams
-from vllm.utils import random_uuid
+from vllm.utils import get_gpu_memory, random_uuid
 
 from aana.deployments.base_deployment import BaseDeployment
 from aana.exceptions.general import InferenceException
@@ -24,7 +24,7 @@ class VLLMConfig(BaseModel):
         model (str): the model name
         dtype (str): the data type (optional, default: "auto")
         quantization (str): the quantization method (optional, default: None)
-        gpu_memory_utilization (float): the GPU memory utilization.
+        gpu_memory_reserved (float): the GPU memory reserved for the model in mb
         default_sampling_params (SamplingParams): the default sampling parameters.
         max_model_len (int): the maximum generated text length in tokens (optional, default: None)
     """
@@ -32,7 +32,7 @@ class VLLMConfig(BaseModel):
     model: str
     dtype: str | None = Field(default="auto")
    quantization: str | None = Field(default=None)
-    gpu_memory_utilization: float
+    gpu_memory_reserved: float
     default_sampling_params: SamplingParams
     max_model_len: int | None = Field(default=None)
     chat_template: str | None = Field(default=None)
@@ -83,7 +83,7 @@ async def apply_config(self, config: dict[str, Any]):
         - model: the model name
         - dtype: the data type (optional, default: "auto")
         - quantization: the quantization method (optional, default: None)
-        - gpu_memory_utilization: the GPU memory utilization.
+        - gpu_memory_reserved: the GPU memory reserved for the model in mb
         - default_sampling_params: the default sampling parameters.
         - max_model_len: the maximum generated text length in tokens (optional, default: None)
         - chat_template: the name of the chat template (optional, default: None)
@@ -93,6 +93,11 @@ async def apply_config(self, config: dict[str, Any]):
         """
         config_obj = VLLMConfig(**config)
         self.model = config_obj.model
+        total_gpu_memory_bytes = get_gpu_memory()
+        total_gpu_memory_mb = total_gpu_memory_bytes / 1024**2
+        self.gpu_memory_utilization = (
+            config_obj.gpu_memory_reserved / total_gpu_memory_mb
+        )
         self.default_sampling_params: SamplingParams = (
             config_obj.default_sampling_params
         )
@@ -101,7 +106,7 @@ async def apply_config(self, config: dict[str, Any]):
             model=config_obj.model,
             dtype=config_obj.dtype,
             quantization=config_obj.quantization,
-            gpu_memory_utilization=config_obj.gpu_memory_utilization,
+            gpu_memory_utilization=self.gpu_memory_utilization,
             max_model_len=config_obj.max_model_len,
         )
 
diff --git a/aana/utils/video.py b/aana/utils/video.py
index 60474f35..19976d6a 100644
--- a/aana/utils/video.py
+++ b/aana/utils/video.py
@@ -1,4 +1,4 @@
-import json
+import json  # noqa: I001
 import pickle
 from collections import defaultdict
 from collections.abc import Generator
@@ -7,7 +7,7 @@
 from typing import TypedDict
 
 import numpy as np
-import torch, decord  # See https://github.com/dmlc/decord/issues/263 # noqa: F401
+import torch, decord  # noqa: F401 # See https://github.com/dmlc/decord/issues/263
 import yt_dlp
 from yt_dlp.utils import DownloadError
diff --git a/mobius-pipeline b/mobius-pipeline
index 386943bd..65aa0048 160000
--- a/mobius-pipeline
+++ b/mobius-pipeline
@@ -1 +1 @@
-Subproject commit 386943bd78d8c3617013ac52bd18a92be0e19c5e
+Subproject commit 65aa004801a47036247f76d9ef058c976fdb22c1
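
For reference, the api_generation.py change routes errors raised mid-stream through the exception handler instead of printing them: RayTaskError and the Mobius Pipeline BaseException are turned into a JSON body that is yielded into the streaming response, and any other Exception falls back to an error/stacktrace payload. Below is a minimal, self-contained sketch of that pattern only; the JSON error shape and the demo driver are assumptions standing in for custom_exception_handler and AanaJSONResponse, not the actual aana code.

import asyncio
import json
import traceback
from collections.abc import AsyncGenerator, AsyncIterator


async def generator_wrapper(outputs: AsyncIterator[dict]) -> AsyncGenerator[bytes, None]:
    """Stream each pipeline output as a JSON chunk; turn exceptions into a JSON chunk too."""
    try:
        async for output in outputs:
            yield json.dumps(output).encode()
    except Exception as e:
        # Mirrors the patched fallback branch: report the error class and stacktrace
        # inside the stream instead of raising after the response has started.
        yield json.dumps(
            {
                "error": e.__class__.__name__,
                "message": str(e),
                "stacktrace": traceback.format_exc(),
            }
        ).encode()


async def _demo() -> None:
    async def outputs() -> AsyncIterator[dict]:
        yield {"text": "partial result"}
        raise RuntimeError("pipeline failed mid-stream")

    async for chunk in generator_wrapper(outputs()):
        print(chunk)


if __name__ == "__main__":
    asyncio.run(_demo())

Yielding the error as a chunk matters because the HTTP status line has already been sent once streaming starts, so raising at that point would simply truncate the response instead of telling the client what went wrong.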
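
The deployment changes replace the fractional gpu_memory_utilization setting with an absolute gpu_memory_reserved value in MB; apply_config converts it back into the fraction vLLM expects using the total GPU memory reported by vllm.utils.get_gpu_memory(). A standalone sketch of that conversion follows, assuming a 40 GB card as an example total (the real deployment queries the GPU at runtime).

def reserved_mb_to_utilization(reserved_mb: float, total_gpu_memory_bytes: int) -> float:
    """Convert an absolute reservation in MB into the fraction vLLM expects."""
    # Same arithmetic as the patched VLLMDeployment.apply_config:
    # bytes -> MB, then reserved MB divided by total MB.
    total_gpu_memory_mb = total_gpu_memory_bytes / 1024**2
    return reserved_mb / total_gpu_memory_mb


if __name__ == "__main__":
    total_bytes = 40 * 1024**3  # assumed example: a 40 GB GPU (e.g. A100 40GB)
    utilization = reserved_mb_to_utilization(10000, total_bytes)  # 10000 MB as in configs/deployments.py
    print(f"gpu_memory_utilization={utilization:.3f}")  # ~0.244 on a 40 GB card

On such a card the 10000 MB reservation used for both vLLM deployments works out to roughly 0.24 of the device, which appears consistent with every deployment now requesting ray_actor_options={"num_gpus": 0.25}, i.e. four deployments sharing one GPU.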