# ray-service.vllm.yaml
apiVersion: ray.io/v1alpha1
kind: RayService
metadata:
  name: vllm
spec:
  serviceUnhealthySecondThreshold: 1800 # Health-check threshold for the service, in seconds (default: 60).
  deploymentUnhealthySecondThreshold: 1800 # Health-check threshold for deployments, in seconds (default: 60).
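  # The thresholds above are presumably raised far beyond the 60s default
  # because the first deployment must pip-install vllm and pull the Falcon-7B
  # weights, which can take many minutes.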
  serveConfigV2: |
    applications:
      - name: vllm
        import_path: vllm_falcon_7b:deployment
        runtime_env:
          working_dir: "https://github.com/vecorro/vllm_examples/archive/refs/heads/main.zip"
          pip: ["vllm==0.1.3"]
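  # `import_path` uses Ray Serve's "module:attribute" convention: Serve imports
  # the `vllm_falcon_7b` module from the downloaded working_dir and runs the
  # bound application stored in its `deployment` attribute. A minimal sketch of
  # what such a module might look like (an assumption, not the repo's actual
  # code):
  #
  #   from ray import serve
  #   from vllm import LLM
  #
  #   @serve.deployment(ray_actor_options={"num_gpus": 1})
  #   class VLLMPredictor:
  #       def __init__(self):
  #           self.llm = LLM(model="tiiuae/falcon-7b")
  #
  #       async def __call__(self, request):
  #           prompt = (await request.json())["prompt"]
  #           return self.llm.generate([prompt])[0].outputs[0].text
  #
  #   deployment = VLLMPredictor.bind()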
  rayClusterConfig:
    rayVersion: '2.6.2' # Should match the Ray version in the container image.
    ######################headGroupSpecs#################################
    # Ray head pod template.
    headGroupSpec:
      serviceType: LoadBalancer
      # The `rayStartParams` are used to configure the `ray start` command.
      # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
      # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
      rayStartParams:
        dashboard-host: '0.0.0.0'
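      # (Optional) Other `ray start` options could be added here; for example,
      # num-cpus: '0' is sometimes set on the head group so that no tasks or
      # Serve replicas are scheduled on the head pod.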
      # Pod template
      template:
        spec:
          containers:
            - name: ray-head
              image: rayproject/ray:2.6.2-py310-cu118
              ports:
                - containerPort: 6379
                  name: gcs
                - containerPort: 8265
                  name: dashboard
                - containerPort: 10001
                  name: client
                - containerPort: 8000
                  name: serve
              volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
              resources:
                limits:
                  cpu: 32
                  memory: "64G"
                requests:
                  cpu: 16
                  memory: "32G"
          volumes:
            - name: ray-logs
              emptyDir: {}
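    # Once the RayService reports healthy, KubeRay exposes the Serve app through
    # a `vllm-serve-svc` Service on port 8000 (the `serve` port above). A hedged
    # usage sketch, assuming the app accepts a JSON body with a "prompt" field:
    #
    #   kubectl port-forward service/vllm-serve-svc 8000:8000
    #   curl -X POST http://localhost:8000/ \
    #        -H "Content-Type: application/json" \
    #        -d '{"prompt": "What is Ray Serve?"}'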
    workerGroupSpecs:
      # The pods in this group are typed as workers.
      - replicas: 1
        minReplicas: 1
        maxReplicas: 4
        groupName: gpu-group
        rayStartParams: {}
        # Pod template
        template:
          spec:
            containers:
              - name: ray-worker
                image: rayproject/ray:2.6.2-py310-cu118
                resources:
                  limits:
                    cpu: 32
                    memory: "64G"
                    nvidia.com/gpu: 1
                  requests:
                    cpu: 16
                    memory: "32G"
                    nvidia.com/gpu: 1
            # These tolerations match the taints that should be applied to the
            # GPU nodes, so that only Ray worker pods are scheduled on them.
            tolerations:
              - key: "ray.io/node-type"
                operator: "Equal"
                value: "worker"
                effect: "NoSchedule"
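            # An example of tainting a GPU node so that only pods with the
            # toleration above land on it (the node name is a placeholder):
            #
            #   kubectl taint nodes <gpu-node-name> ray.io/node-type=worker:NoSchedule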