Add ray-serve-vllm deployment sample #1020

Merged (1 commit) on Apr 29, 2024
2 files renamed without changes.
5 changes: 5 additions & 0 deletions deploy/ray-serve-vllm/Dockerfile.app
@@ -0,0 +1,5 @@
# the base image is built from Dockerfile.vllm.ray
FROM vllm/vllm-openai:ray-2.11.0-py3.10.12-patched

# Copy the packaged python application
COPY llm-serving-app.zip /vllm-workspace/
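The application archive referenced by the COPY line is produced outside this diff. A minimal packaging sketch (a hypothetical helper, run from deploy/ray-serve-vllm/, assuming the llm-serving-app/ directory shown later in this PR):

import shutil

# Archive the contents of llm-serving-app/ (llm-serving.py, templates/, ...) at the
# top level of llm-serving-app.zip, so Ray Serve's import_path "llm-serving:deployment"
# resolves once the zip is used as the runtime_env working_dir.
shutil.make_archive("llm-serving-app", "zip", root_dir="llm-serving-app")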
10 changes: 10 additions & 0 deletions deploy/ray-serve-vllm/Dockerfile.vllm.ray
@@ -0,0 +1,10 @@
# Use vLLM 0.4.1; Ray is pinned to 2.11.0 for now
# Python version is 3.10.12
FROM vllm/vllm-openai:v0.4.1

# curl/wget for health checks; ray[default] adds the dashboard and cluster utilities
RUN apt-get update && apt-get install -y curl wget && pip install 'ray[default]==2.11.0' -i https://pypi.mirrors.ustc.edu.cn/simple/

# Patch for vLLM; can be removed once https://github.com/vllm-project/vllm/issues/2683 is fixed
COPY vllm-patched/serving_chat.py /usr/local/lib/python3.10/dist-packages/vllm/entrypoints/openai/serving_chat.py
COPY vllm-patched/serving_engine.py /usr/local/lib/python3.10/dist-packages/vllm/entrypoints/openai/serving_engine.py
141 changes: 141 additions & 0 deletions deploy/ray-serve-vllm/llm-serving-app/llm-serving.py
@@ -0,0 +1,141 @@
import json
import logging
from typing import AsyncGenerator

import ray
import fastapi
# from huggingface_hub import login
from ray import serve

from fastapi import Request
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response, StreamingResponse

import vllm
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
CompletionRequest, ErrorResponse)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext


TIMEOUT_KEEP_ALIVE = 5 # seconds

logger = logging.getLogger("ray.serve")

app = fastapi.FastAPI()

# Modified based on https://github.com/vllm-project/vllm/blob/v0.4.1/vllm/entrypoints/openai/api_server.py

@serve.deployment(num_replicas=1)
@serve.ingress(app)
class VLLMPredictDeployment():
def __init__(self, **kwargs):
"""
Construct a VLLM deployment.

Refer to https://github.com/vllm-project/vllm/blob/main/vllm/engine/arg_utils.py
for the full list of arguments.

Args:
model: name or path of the huggingface model to use
            download_dir: directory to download and load the weights;
                defaults to the Hugging Face cache directory.
use_np_weights: save a numpy copy of model weights for
faster loading. This can increase the disk usage by up to 2x.
use_dummy_weights: use dummy values for model weights.
            dtype: data type for model weights and activations.
                The "auto" option uses FP16 precision for FP32 and FP16
                models, and BF16 precision for BF16 models.
seed: random seed.
worker_use_ray: use Ray for distributed serving, will be
automatically set when using more than 1 GPU
pipeline_parallel_size: number of pipeline stages.
tensor_parallel_size: number of tensor parallel replicas.
block_size: token block size.
swap_space: CPU swap space size (GiB) per GPU.
gpu_memory_utilization: the percentage of GPU memory to be used for
the model executor
max_num_batched_tokens: maximum number of batched tokens per iteration
max_num_seqs: maximum number of sequences per iteration.
disable_log_stats: disable logging statistics.
            engine_use_ray: use Ray to start the LLM engine in a separate
                process from the server process.
disable_log_requests: disable logging requests.
"""
        # Hard-coded sample settings; adjust the model path, parallelism, and
        # context length for your environment.
        kwargs = {**kwargs,
                  'model': '/data/models/qwen1.5-7b-chat',
                  'trust_remote_code': True,
                  'worker_use_ray': True,
                  'tensor_parallel_size': 1,
                  'gpu_memory_utilization': 0.9,
                  'max_model_len': 6000}

logger.info(f"vLLM API server version {vllm.__version__}")
logger.info(f"kwargs: {kwargs}")

args = AsyncEngineArgs(**kwargs)
logger.info(f"args: {args}")
served_model = args.model
engine_args = AsyncEngineArgs.from_cli_args(args)
engine = AsyncLLMEngine.from_engine_args(
engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
        # Use the vLLM OpenAI server defaults: respond as "assistant", no LoRA
        # modules, and the Qwen ChatML template packaged with this app.
        args.response_role = "assistant"
        args.lora_modules = None
        args.chat_template = "./templates/chat-template-qwen.jinja2"
self.openai_serving_chat = OpenAIServingChat(engine, served_model,
args.response_role,
args.lora_modules,
args.chat_template)
self.openai_serving_completion = OpenAIServingCompletion(
engine, served_model, args.lora_modules)


@app.get("/health")
async def health(self) -> Response:
"""Health check."""
await self.openai_serving_chat.engine.check_health()
return Response(status_code=200)


@app.get("/v1/models")
async def show_available_models(self):
models = await self.openai_serving_chat.show_available_models()
return JSONResponse(content=models.model_dump())


@app.get("/version")
async def show_version(self):
ver = {"version": vllm.__version__}
return JSONResponse(content=ver)


@app.post("/v1/chat/completions")
async def create_chat_completion(self, request: ChatCompletionRequest,
raw_request: Request):
generator = await self.openai_serving_chat.create_chat_completion(
request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(),
status_code=generator.code)
if request.stream:
return StreamingResponse(content=generator,
media_type="text/event-stream")
else:
return JSONResponse(content=generator.model_dump())


@app.post("/v1/completions")
async def create_completion(self, request: CompletionRequest, raw_request: Request):
generator = await self.openai_serving_completion.create_completion(
request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(),
status_code=generator.code)
if request.stream:
return StreamingResponse(content=generator,
media_type="text/event-stream")
else:
return JSONResponse(content=generator.model_dump())

deployment = VLLMPredictDeployment.bind()
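Once the RayService below is applied and healthy, the deployment exposes the OpenAI-compatible routes defined above. A hedged smoke test, assuming the Serve port 8000 is reachable at http://localhost:8000 (for example via a port-forward; the URL is environment-specific):

import requests

BASE_URL = "http://localhost:8000"  # assumption: Serve port 8000 exposed here

# Liveness and model discovery
print(requests.get(f"{BASE_URL}/health", timeout=10).status_code)  # expect 200
print(requests.get(f"{BASE_URL}/v1/models", timeout=10).json())

# Non-streaming chat completion; "model" must match the served model path above
resp = requests.post(
    f"{BASE_URL}/v1/chat/completions",
    json={
        "model": "/data/models/qwen1.5-7b-chat",
        "messages": [{"role": "user", "content": "Hello!"}],
        "stream": False,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])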
4 changes: 4 additions & 0 deletions deploy/ray-serve-vllm/llm-serving-app/templates/chat-template-qwen.jinja2
@@ -0,0 +1,4 @@
{% for message in messages %}{{'<|im_start|>' + message['role'] + '
' + message['content'] + '<|im_end|>' + '
'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
' }}{% endif %}
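The template above is the ChatML-style prompt format used by Qwen chat models; vLLM applies it server-side via the chat_template set in llm-serving.py. For illustration only, a quick local rendering sketch with Jinja2 (the '\n' escapes below decode to the literal newlines present in the file):

from jinja2 import Template

CHAT_TEMPLATE = (
    "{% for message in messages %}"
    "{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}"
    "{% endfor %}"
    "{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}"
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

# Renders:
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Hello!<|im_end|>
# <|im_start|>assistant
print(Template(CHAT_TEMPLATE).render(messages=messages, add_generation_prompt=True))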
118 changes: 118 additions & 0 deletions deploy/ray-serve-vllm/raycluster-serve-llm.yaml
@@ -0,0 +1,118 @@
apiVersion: ray.io/v1
kind: RayService
metadata:
name: rayservice-sample
namespace: kuberay-system
spec:
# serveConfigV2 takes a yaml multi-line scalar, which should be a Ray Serve multi-application config. See https://docs.ray.io/en/latest/serve/multi-app.html.
serviceUnhealthySecondThreshold: 900
deploymentUnhealthySecondThreshold: 300
serveConfigV2: |
applications:
- name: llm-serving-app
import_path: llm-serving:deployment
route_prefix: /
runtime_env:
        working_dir: file:///vllm-workspace/llm-serving-app.zip  # must match the archive copied in Dockerfile.app
deployments:
- name: VLLMPredictDeployment
num_replicas: 1
rayClusterConfig:
rayVersion: '2.11.0' # should match the Ray version in the image of the containers
######################headGroupSpecs#################################
# Ray head pod template.
headGroupSpec:
# The `rayStartParams` are used to configure the `ray start` command.
# See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
# See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
rayStartParams:
resources: '"{\"accelerator_type_cpu\": 4}"'
dashboard-host: '0.0.0.0'
#pod template
template:
spec:
volumes:
# mount the model from hostPath
- name: model-data
hostPath:
path: /data/models
type: Directory
- name: tz-config
hostPath:
path: /etc/localtime
containers:
- name: ray-head
image: vllm/vllm-openai:ray-2.11.0-py3.10.12-llm-app
#image: rayproject/ray-ml:2.10.0-py310-zip
resources:
limits:
cpu: 4
memory: 16Gi
requests:
cpu: 2
memory: 2Gi
ports:
- containerPort: 6379
name: gcs-server
- containerPort: 8265 # Ray dashboard
name: dashboard
- containerPort: 10001
name: client
- containerPort: 8000
name: serve
volumeMounts:
- mountPath: /data/models
name: model-data
- name: tz-config
mountPath: /etc/localtime
workerGroupSpecs:
    # the number of pod replicas in this worker group
- replicas: 1
minReplicas: 0
maxReplicas: 5
    # logical group name; here it is called small-group, but any valid name can be used
groupName: small-group
# The `rayStartParams` are used to configure the `ray start` command.
# See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
# See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
rayStartParams:
resources: '"{\"accelerator_type_cpu\": 4, \"accelerator_type_3090\": 1}"'
#pod template
template:
spec:
volumes:
- name: model-data
hostPath:
path: /NFS/125_bakup/models
type: Directory
- name: tz-config
hostPath:
path: /etc/localtime
- name: dshm
emptyDir:
medium: Memory
sizeLimit: "5.24Gi"
containers:
          - name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name' or '123-abc')
#image: rayproject/ray-ml:2.10.0-py310-zip
image: vllm/vllm-openai:ray-2.11.0-py3.10.12-llm-app
lifecycle:
preStop:
exec:
command: ["/bin/sh","-c","ray stop"]
resources:
limits:
cpu: "4"
memory: "16Gi"
nvidia.com/gpu: 1
requests:
cpu: "2"
memory: "2Gi"
nvidia.com/gpu: 1
volumeMounts:
- mountPath: /data/models
name: model-data
- name: tz-config
mountPath: /etc/localtime
- mountPath: /dev/shm
name: dshm
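After the manifest is applied, the Serve application's health can be inspected through the Ray dashboard's Serve REST API. A sketch, assuming port 8265 of the head pod is port-forwarded to localhost (the generated head Service name may differ in your cluster):

import requests

# Assumption:
#   kubectl -n kuberay-system port-forward svc/rayservice-sample-head-svc 8265:8265
DASHBOARD_URL = "http://localhost:8265"

# List Serve applications and their statuses (the llm-serving-app application
# declared in serveConfigV2 should report RUNNING once ready).
apps = requests.get(f"{DASHBOARD_URL}/api/serve/applications/", timeout=10).json()
for name, details in apps.get("applications", {}).items():
    print(name, details.get("status"))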
7 changes: 0 additions & 7 deletions deploy/ray/Dockerfile

This file was deleted.
