feat/ps routes #73

Merged 5 commits on Sep 5, 2024
1 change: 0 additions & 1 deletion Taskfile.yaml
@@ -31,7 +31,6 @@ tasks:
    sources:
      - Dockerfile.*
      - faster_whisper_server/*.py
  sync: lsyncd lsyncd.conf
  cii:
    cmds:
      - act --rm --action-offline-mode --secret-file .secrets {{.CLI_ARGS}}
25 changes: 23 additions & 2 deletions faster_whisper_server/config.py
@@ -1,6 +1,7 @@
import enum
from typing import Self

from pydantic import BaseModel, Field
from pydantic import BaseModel, Field, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict

SAMPLES_PER_SECOND = 16000
@@ -151,7 +152,9 @@ class WhisperConfig(BaseModel):

    model: str = Field(default="Systran/faster-whisper-medium.en")
    """
    Huggingface model to use for transcription. Note, the model must support being run using CTranslate2.
    Default Huggingface model to use for transcription. Note, the model must support being run using CTranslate2.
    This model will be used if no model is specified in the request.

    Models created by the authors of `faster-whisper` can be found at https://huggingface.co/Systran
    You can find other supported models at https://huggingface.co/models?p=2&sort=trending&search=ctranslate2 and https://huggingface.co/models?sort=trending&search=ct2
    """
@@ -199,6 +202,16 @@ class Config(BaseSettings):
"""
Maximum number of models that can be loaded at a time.
"""
preload_models: list[str] = Field(
default_factory=list,
examples=[
["Systran/faster-whisper-medium.en"],
["Systran/faster-whisper-medium.en", "Systran/faster-whisper-small.en"],
],
)
"""
List of models to preload on startup. Shouldn't be greater than `max_models`. By default, the model is first loaded on first request.
""" # noqa: E501
max_no_data_seconds: float = 1.0
"""
Max duration to wait for the next audio chunk before transcription is finilized and connection is closed.
@@ -218,5 +231,13 @@ class Config(BaseSettings):
    Should be greater than `max_inactivity_seconds`.
    """

    @model_validator(mode="after")
    def ensure_preloaded_models_is_lte_max_models(self) -> Self:
        if len(self.preload_models) > self.max_models:
            raise ValueError(
                f"Number of preloaded models ({len(self.preload_models)}) is greater than max_models ({self.max_models})"  # noqa: E501
            )
        return self


config = Config()
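
A minimal sketch of how the new `preload_models` option could be set. It assumes the default pydantic-settings behavior, where a `list[str]` field is populated from an environment variable of the same name containing JSON; the model names and `MAX_MODELS` value below are illustrative, and the actual env var names depend on the `SettingsConfigDict` in use:

import os

# Hypothetical env values; pydantic-settings parses JSON for complex (list) fields.
os.environ["PRELOAD_MODELS"] = '["Systran/faster-whisper-medium.en"]'
os.environ["MAX_MODELS"] = "2"

from faster_whisper_server.config import Config

config = Config()
# The model_validator above rejects configs where preload_models exceeds max_models.
assert len(config.preload_models) <= config.max_models
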
39 changes: 36 additions & 3 deletions faster_whisper_server/main.py
@@ -2,6 +2,8 @@

import asyncio
from collections import OrderedDict
from contextlib import asynccontextmanager
import gc
from io import BytesIO
import time
from typing import TYPE_CHECKING, Annotated, Literal
@@ -45,7 +47,7 @@
from faster_whisper_server.transcriber import audio_transcriber

if TYPE_CHECKING:
    from collections.abc import Generator, Iterable
    from collections.abc import AsyncGenerator, Generator, Iterable

    from faster_whisper.transcribe import TranscriptionInfo
    from huggingface_hub.hf_api import ModelInfo
@@ -63,7 +65,7 @@ def load_model(model_name: str) -> WhisperModel:
        del loaded_models[oldest_model_name]
    logger.debug(f"Loading {model_name}...")
    start = time.perf_counter()
    # NOTE: will raise an exception if the model name isn't valid
    # NOTE: will raise an exception if the model name isn't valid. Should I do an explicit check?
    whisper = WhisperModel(
        model_name,
        device=config.whisper.inference_device,
@@ -81,7 +83,15 @@ def load_model(model_name: str) -> WhisperModel:

logger.debug(f"Config: {config}")

app = FastAPI()

@asynccontextmanager
async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
    # Load the configured models once at startup instead of on first request.
    for model_name in config.preload_models:
        load_model(model_name)
    yield


app = FastAPI(lifespan=lifespan)

if config.allow_origins is not None:
    app.add_middleware(
@@ -98,6 +108,29 @@ def health() -> Response:
    return Response(status_code=200, content="OK")


@app.get("/api/ps", tags=["experimental"], summary="Get a list of loaded models.")
def get_running_models() -> dict[str, list[str]]:
    return {"models": list(loaded_models.keys())}


@app.post("/api/ps/{model_name:path}", tags=["experimental"], summary="Load a model into memory.")
def load_model_route(model_name: str) -> Response:
    if model_name in loaded_models:
        return Response(status_code=409, content="Model already loaded")
    load_model(model_name)
    return Response(status_code=201)


@app.delete("/api/ps/{model_name:path}", tags=["experimental"], summary="Unload a model from memory.")
def stop_running_model(model_name: str) -> Response:
    model = loaded_models.get(model_name)
    if model is not None:
        del loaded_models[model_name]
        # Nudge the garbage collector so the model's memory is reclaimed promptly.
        gc.collect()
        return Response(status_code=204)
    return Response(status_code=404)


@app.get("/v1/models")
def get_models() -> ModelListResponse:
    models = huggingface_hub.list_models(library="ctranslate2", tags="automatic-speech-recognition", cardData=True)
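
A quick sketch of how the new experimental endpoints could be exercised, assuming a server listening on localhost:8000 (the port and model name are illustrative):

import httpx

client = httpx.Client(base_url="http://localhost:8000")
model = "Systran/faster-whisper-medium.en"

# List the currently loaded models.
print(client.get("/api/ps").json())  # e.g. {"models": []}

# Load a model into memory: 201 on success, 409 if it is already loaded.
print(client.post(f"/api/ps/{model}").status_code)

# Unload it again: 204 on success, 404 if it was not loaded.
print(client.delete(f"/api/ps/{model}").status_code)
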
19 changes: 16 additions & 3 deletions flake.nix
@@ -20,19 +20,32 @@
            act
            ffmpeg-full
            go-task
            lsyncd
            parallel
            pre-commit
            pv
            python312
            rsync
            websocat
            uv
            cudaPackages_12.cudnn_8_9
          ];

          # https://github.com/NixOS/nixpkgs/issues/278976#issuecomment-1879685177
          # NOTE: Without adding `/run/...` the following error occurs
          # RuntimeError: CUDA failed with error CUDA driver version is insufficient for CUDA runtime version
          #
          # NOTE: sometimes it still doesn't work but rebooting the system fixes it
          LD_LIBRARY_PATH = "/run/opengl-driver/lib:${
            pkgs.lib.makeLibraryPath [
              pkgs.cudaPackages_12.cudnn_8_9
              pkgs.zlib
              pkgs.stdenv.cc.cc
              pkgs.openssl
            ]
          }";

          shellHook = ''
            source .venv/bin/activate
            export LD_LIBRARY_PATH=${pkgs.stdenv.cc.cc.lib}/lib:$LD_LIBRARY_PATH
            export LD_LIBRARY_PATH=${pkgs.zlib}/lib:$LD_LIBRARY_PATH
            source .env
          '';
        };
20 changes: 0 additions & 20 deletions lsyncd.conf

This file was deleted.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -70,6 +70,7 @@ ignore = [
"W505",
"ISC001", # recommended to disable for formatting
"INP001",
"PT018",
]

[tool.ruff.lint.isort]
Expand Down
48 changes: 26 additions & 22 deletions scripts/client.py
@@ -2,6 +2,7 @@
from pathlib import Path
import subprocess
import threading
import time

import httpx
import keyboard
@@ -14,11 +15,12 @@
# The audio file will be sent to the server for transcription.
# The transcription will be copied to the clipboard.
# For a short audio clip of a couple of sentences, with inference running on a GPU, the response time is very fast (less than 2 seconds). # noqa: E501
# Run this with `sudo -E python scripts/client.py`

CHUNK = 2**12
AUDIO_RECORD_CMD = [
"ffmpeg",
# "-hide_banner",
"-hide_banner",
# "-loglevel",
# "quiet",
"-f",
@@ -27,15 +29,6 @@
"default",
"-f",
"wav",
# "-ac",
# "1",
# "-ar",
# "16000",
# "-f",
# "s16le",
# "-acodec",
# "pcm_s16le",
# "-",
]
COPY_TO_CLIPBOARD_CMD = "wl-copy"
OPENAI_BASE_URL = "ws://localhost:8000/v1"
@@ -48,12 +41,13 @@

client = httpx.Client(base_url=OPENAI_BASE_URL, timeout=TIMEOUT)
is_running = threading.Event()
file = Path("test.wav") # TODO: use tempfile

file = Path("test.wav")  # HACK: I had a hard time using a temporary file due to permission issues


while True:
    keyboard.wait(KEYBIND)
    print("Action started")
    print("Recording started")
    process = subprocess.Popen(
        [*AUDIO_RECORD_CMD, "-y", str(file.name)],
        stdout=subprocess.PIPE,
@@ -63,17 +57,27 @@
    )
    keyboard.wait(KEYBIND)
    process.kill()
    print("Action finished")
    stdout, stderr = process.communicate()
    if stdout or stderr:
        print(f"stdout: {stdout}")
        print(f"stderr: {stderr}")
    print(f"Recording finished. File size: {file.stat().st_size} bytes")

    with open(file, "rb") as f:
        res = client.post(
            OPENAI_BASE_URL + TRANSCRIBE_PATH,
            files={"file": f},
            data={
                "response_format": RESPONSE_FORMAT,
                "language": LANGUAGE,
            },
        )
    try:
        with open(file, "rb") as fd:
            start = time.perf_counter()
            res = client.post(
                OPENAI_BASE_URL + TRANSCRIBE_PATH,
                files={"file": fd},
                data={
                    "response_format": RESPONSE_FORMAT,
                    "language": LANGUAGE,
                },
            )
            end = time.perf_counter()
            print(f"Transcription took {end - start} seconds")
        transcription = res.text
        print(transcription)
        subprocess.run([COPY_TO_CLIPBOARD_CMD], input=transcription.encode(), check=True)
    except httpx.ConnectError as e:
        print(f"Couldn't connect to server: {e}")