# ray-service.vllm.yaml
apiVersion: ray.io/v1alpha1
kind: RayService
metadata:
  name: vllm
spec:
  serviceUnhealthySecondThreshold: 1800 # Health-check threshold for the service, in seconds (default: 60).
  deploymentUnhealthySecondThreshold: 1800 # Health-check threshold for deployments, in seconds (default: 60).
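  # The thresholds above are presumably raised far beyond the 60s default
  # because the first deployment must pip-install vllm and pull the Falcon-7B
  # weights, which can take many minutes.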
  serveConfigV2: |
    applications:
      - name: vllm
        import_path: vllm_falcon_7b:deployment
        runtime_env:
          working_dir: "https://github.com/vecorro/vllm_examples/archive/refs/heads/main.zip"
          pip: ["vllm==0.1.3"]
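  # `import_path` uses Ray Serve's "module:attribute" convention: Serve imports
  # the `vllm_falcon_7b` module from the downloaded working_dir and runs the
  # bound application stored in its `deployment` attribute. A minimal sketch of
  # what such a module might look like (an assumption, not the repo's actual
  # code):
  #
  #   from ray import serve
  #   from vllm import LLM
  #
  #   @serve.deployment(ray_actor_options={"num_gpus": 1})
  #   class VLLMPredictor:
  #       def __init__(self):
  #           self.llm = LLM(model="tiiuae/falcon-7b")
  #
  #       async def __call__(self, request):
  #           prompt = (await request.json())["prompt"]
  #           return self.llm.generate([prompt])[0].outputs[0].text
  #
  #   deployment = VLLMPredictor.bind()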
  rayClusterConfig:
    rayVersion: '2.6.2' # Should match the Ray version in the container image.
    ######################headGroupSpecs#################################
    # Ray head pod template.
    headGroupSpec:
      serviceType: LoadBalancer
      # The `rayStartParams` are used to configure the `ray start` command.
      # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
      # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
      rayStartParams:
        dashboard-host: '0.0.0.0'
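      # (Optional) Other `ray start` options could be added here; for example,
      # num-cpus: '0' is sometimes set on the head group so that no tasks or
      # Serve replicas are scheduled on the head pod.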
      # Pod template
      template:
        spec:
          containers:
            - name: ray-head
              image: rayproject/ray:2.6.2-py310-cu118
              ports:
                - containerPort: 6379
                  name: gcs
                - containerPort: 8265
                  name: dashboard
                - containerPort: 10001
                  name: client
                - containerPort: 8000
                  name: serve
              volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
              resources:
                limits:
                  cpu: 32
                  memory: "64G"
                requests:
                  cpu: 16
                  memory: "32G"
          volumes:
            - name: ray-logs
              emptyDir: {}
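    # Once the RayService reports healthy, KubeRay exposes the Serve app through
    # a `vllm-serve-svc` Service on port 8000 (the `serve` port above). A hedged
    # usage sketch, assuming the app accepts a JSON body with a "prompt" field:
    #
    #   kubectl port-forward service/vllm-serve-svc 8000:8000
    #   curl -X POST http://localhost:8000/ \
    #        -H "Content-Type: application/json" \
    #        -d '{"prompt": "What is Ray Serve?"}'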
    workerGroupSpecs:
      # The pods in this group are typed as workers.
      - replicas: 1
        minReplicas: 1
        maxReplicas: 4
        groupName: gpu-group
        rayStartParams: {}
        # Pod template
        template:
          spec:
            containers:
              - name: ray-worker
                image: rayproject/ray:2.6.2-py310-cu118
                resources:
                  limits:
                    cpu: 32
                    memory: "64G"
                    nvidia.com/gpu: 1
                  requests:
                    cpu: 16
                    memory: "32G"
                    nvidia.com/gpu: 1
            # These tolerations match the taints that should be applied to the
            # GPU nodes, so that only Ray worker pods are scheduled on them.
            tolerations:
              - key: "ray.io/node-type"
                operator: "Equal"
                value: "worker"
                effect: "NoSchedule"
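            # An example of tainting a GPU node so that only pods with the
            # toleration above land on it (the node name is a placeholder):
            #
            #   kubectl taint nodes <gpu-node-name> ray.io/node-type=worker:NoSchedule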