Skip to content

Commit

Permalink
Support multiple card for LLM inference
Browse files Browse the repository at this point in the history
DeepSeek models require more than one Gaudi card.
Defined NUM_CARDS for users to configure.
Pass NUM_CARDS to container as compose does.
Pass NUM_SHARD to tgi and --tensor-parallel-size to vllm.
Fix CI helm chart detection issue.

Signed-off-by: Dolpher Du <[email protected]>
  • Loading branch information
yongfengdu committed Feb 12, 2025
1 parent 3016f5f commit 26efce2
Show file tree
Hide file tree
Showing 14 changed files with 32 additions and 14 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/push-release-charts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,13 @@ jobs:
# Update Examples
e2e_charts=$(git diff --name-only ${base_commit} ${merged_commit} | \
grep "^$CHARTS_DIR" | \
grep -vE 'README.md|valuefiles.yaml|common|*.sh' | \
grep -vE 'valuefiles.yaml|common|*.md|*.sh' | \
cut -d'/' -f2 | sort -u )
echo "Charts to be updated: $e2e_charts"
echo "${{ secrets.ACTION_TOKEN }}" | helm registry login ghcr.io -u opea --password-stdin
pushd $CHARTS_DIR
for chart in ${e2e_charts}; do
if [ -f $chart ]; then continue; fi
echo "Updating $chart"
helm dependency update ${chart}
helm package $chart
Expand Down
6 changes: 4 additions & 2 deletions helm-charts/agentqna/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,8 @@ tgi:
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
extraCmdArgs: ["--sharded", "true", "--num-shard", "4"]
NUM_CARDS: 4
SHARDED: "true"

vllm:
enabled: false
Expand All @@ -151,7 +152,8 @@ vllm:
OMPI_MCA_btl_vader_single_copy_mechanism: none
PT_HPU_ENABLE_LAZY_COLLECTIVES: true
VLLM_SKIP_WARMUP: true
extraCmdArgs: ["--tensor-parallel-size", "4", "--max-seq_len-to-capture", "16384"]
NUM_CARDS: 4
extraCmdArgs: ["--max-seq_len-to-capture", "16384"]

nginx:
service:
Expand Down
3 changes: 1 addition & 2 deletions helm-charts/chatqna/gaudi-vllm-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,8 @@ vllm:

PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"

NUM_CARDS: 1
extraCmdArgs: [
"--tensor-parallel-size", "1",
"--block-size", "128",
"--max-num-seqs", "256",
"--max-seq_len-to-capture", "2048"
Expand Down
3 changes: 1 addition & 2 deletions helm-charts/chatqna/guardrails-gaudi-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,8 @@ vllm:

PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"

NUM_CARDS: 1
extraCmdArgs: [
"--tensor-parallel-size", "1",
"--block-size", "128",
"--max-num-seqs", "256",
"--max-seq_len-to-capture", "2048"
Expand Down
5 changes: 3 additions & 2 deletions helm-charts/common/agent/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ tgi:
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
extraCmdArgs: ["--sharded", "true", "--num-shard", "4"]
NUM_CARDS: 4

vllm:
enabled: false
Expand All @@ -38,7 +38,8 @@ vllm:
OMPI_MCA_btl_vader_single_copy_mechanism: none
PT_HPU_ENABLE_LAZY_COLLECTIVES: true
VLLM_SKIP_WARMUP: true
extraCmdArgs: ["--tensor-parallel-size", "4", "--max-seq_len-to-capture", "16384"]
NUM_CARDS: 4
extraCmdArgs: ["--max-seq_len-to-capture", "16384"]

replicaCount: 1

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ vllm:
tag: "latest"
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
OMPI_MCA_btl_vader_single_copy_mechanism: none
extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
NUM_CARDS: 1
extraCmdArgs: ["--block-size", "128", "--max-num-seqs", "256", "--max-seq_len-to-capture", "2048"]
resources:
limits:
habana.ai/gaudi: 1
3 changes: 2 additions & 1 deletion helm-charts/common/llm-uservice/vllm-gaudi-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ vllm:
tag: "latest"
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
OMPI_MCA_btl_vader_single_copy_mechanism: none
extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
NUM_CARDS: 1
extraCmdArgs: ["--block-size", "128", "--max-num-seqs", "256", "--max-seq_len-to-capture", "2048"]
resources:
limits:
habana.ai/gaudi: 1
Expand Down
7 changes: 7 additions & 0 deletions helm-charts/common/tgi/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,10 @@ data:
{{- if .Values.BATCH_BUCKET_SIZE }}
BATCH_BUCKET_SIZE: {{ .Values.BATCH_BUCKET_SIZE | quote }}
{{- end }}
{{- if .Values.SHARDED }}
SHARDED: {{ .Values.SHARDED }}
{{- end }}
{{- if .Values.NUM_CARDS }}
NUM_CARDS: {{ .Values.NUM_CARDS | quote }}
NUM_SHARD: {{ .Values.NUM_CARDS | quote }}
{{- end }}
1 change: 1 addition & 0 deletions helm-charts/common/tgi/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ MAX_TOTAL_TOKENS: ""
CUDA_GRAPHS: "0"
HF_HUB_DISABLE_PROGRESS_BARS: "1"
HF_HUB_ENABLE_HF_TRANSFER: "0"
NUM_CARDS: "1"

global:
http_proxy: ""
Expand Down
3 changes: 2 additions & 1 deletion helm-charts/common/vllm/gaudi-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ image:

# VLLM_CPU_KVCACHE_SPACE: "40"
OMPI_MCA_btl_vader_single_copy_mechanism: none
extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
extraCmdArgs: ["--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
NUM_CARDS: "1"
resources:
limits:
habana.ai/gaudi: 1
3 changes: 3 additions & 0 deletions helm-charts/common/vllm/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,6 @@ data:
{{- if .Values.VLLM_TORCH_PROFILER_DIR }}
VLLM_TORCH_PROFILER_DIR: {{ .Values.VLLM_TORCH_PROFILER_DIR | quote }}
{{- end }}
{{- if .Values.NUM_CARDS }}
NUM_CARDS: {{ .Values.NUM_CARDS | quote }}
{{- end }}
2 changes: 2 additions & 0 deletions helm-charts/common/vllm/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ spec:
- {{ . | quote }}
{{- end }}
{{- end }}
- "--tensor-parallel-size"
- {{ .Values.NUM_CARDS | quote }}
- "--model"
- {{ .Values.LLM_MODEL_ID | quote }}
- "--host"
Expand Down
1 change: 1 addition & 0 deletions helm-charts/common/vllm/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
OMPI_MCA_btl_vader_single_copy_mechanism: ""
PT_HPU_ENABLE_LAZY_COLLECTIVES: ""
VLLM_CPU_KVCACHE_SPACE: ""
NUM_CARDS: "1"

global:
http_proxy: ""
Expand Down
3 changes: 1 addition & 2 deletions helm-charts/docsum/gaudi-vllm-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,8 @@ vllm:

PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"

NUM_CARDS: 1
extraCmdArgs: [
"--tensor-parallel-size", "1",
"--block-size", "128",
"--max-num-seqs", "256",
"--max-seq_len-to-capture", "2048"
Expand Down

0 comments on commit 26efce2

Please sign in to comment.