setup.sh script multiple model support using HF download #61

Closed
wants to merge 76 commits
Commits:
7d68656
add print_prompts cli arg
tstescoTT Dec 4, 2024
8d78d64
remove redundant stop token from vLLM example api calls
tstescoTT Dec 4, 2024
3108bc0
add capture_trace.py util to pre-prompt vllm server to capture all tr…
tstescoTT Dec 4, 2024
ea3d75d
adding utils/startup_utils.py to refine handling of startup in automa…
tstescoTT Dec 4, 2024
cc1d17a
adding force_max_tokens as option to call_inference_api(), add input_…
tstescoTT Dec 4, 2024
059d513
faster mock model prefill
tstescoTT Dec 4, 2024
48d17de
make it not send stop tokens by default and speed up mock model decod…
tstescoTT Dec 5, 2024
fead1aa
adding token count verification for vllm open ai api server to prompt…
tstescoTT Dec 5, 2024
5a80551
add max-log-len to limit logging of prompts to avoid clutter in logs
tstescoTT Dec 5, 2024
d845f08
add InferenceServerContext to startup_utils.py, improve wait_for_healthy
tstescoTT Dec 5, 2024
632ac83
add all_responses to utils/prompt_client_cli.py not using globals
tstescoTT Dec 5, 2024
f563e32
adding new utils/prompt_client_cli.py using utils/prompt_client.py an…
tstescoTT Dec 5, 2024
2467c74
fix health endpoint
tstescoTT Dec 5, 2024
af5e8dc
add vllm_model to EnvironmentConfig instead of BatchConfig
tstescoTT Dec 5, 2024
60c7ab2
refactor utils/capture_traces.py with new prompt_client
tstescoTT Dec 5, 2024
10993a2
fix utils imports
tstescoTT Dec 5, 2024
20ccdf4
fix BatchConfig usage
tstescoTT Dec 6, 2024
eab7e76
add benchmarking/online_benchmark_prompt_client.py using prompt_clien…
tstescoTT Dec 6, 2024
90acdf6
add benchmarking/online_benchmark_prompt_client.py using prompt_clien…
tstescoTT Dec 6, 2024
ec486ad
add benchmarking, evals, and tests dirs to Dockerfile
tstescoTT Dec 6, 2024
c58d7b3
update patchfile and benchmarking README.md with commands
tstescoTT Dec 6, 2024
fe4f96d
update Docker IMAGE_VERSION to v0.0.3
tstescoTT Dec 6, 2024
f3d815a
improve doc
tstescoTT Dec 6, 2024
8246a72
update benchmark_serving.patch
tstescoTT Dec 6, 2024
765c4be
add tt_model_runner.py patch for best_of
tstescoTT Dec 6, 2024
b93370d
update benchmarking/benchmark_serving.patch
tstescoTT Dec 6, 2024
5e07baa
use CACHE_ROOT for vllm_online_benchmark_results dir
tstescoTT Dec 6, 2024
d0e0b0f
adding timestamped online benchmark run result directory, rps=1 for v…
tstescoTT Dec 9, 2024
5db2523
update benchmark output file naming convention
tstescoTT Dec 9, 2024
5ab742c
rename benchmarking/online_benchmark_prompt_client.py to benchmarking…
tstescoTT Dec 9, 2024
06420bd
increase num_prompts default, default to 128/128 online test
tstescoTT Dec 9, 2024
b7e4cfc
use min_tokens and ignore_eos=True to force output seq len
tstescoTT Dec 9, 2024
dda29a9
adding min_tokens to locust requests
tstescoTT Dec 9, 2024
f8b3033
add --ignore-eos to vllm_online_benchmark.py to force the output seq …
tstescoTT Dec 10, 2024
12c38fc
add context_lens (isl, osl) pairs to capture_traces() to capture corr…
tstescoTT Dec 10, 2024
1cabdc9
add trace pre-capture to prompt_client_cli.py with option to disable
tstescoTT Dec 10, 2024
68f08d0
better comment and logs for trace capture
tstescoTT Dec 10, 2024
962c507
use TPOT and TPS in benchmarking/prompt_client_online_benchmark.py, a…
tstescoTT Dec 12, 2024
62bf427
update utils/prompt_client_cli.py and docs
tstescoTT Dec 12, 2024
d9e163c
remove WIP utils/startup_utils.py from this branch
tstescoTT Dec 12, 2024
cd29085
adding doc string to BatchProcessor
tstescoTT Dec 31, 2024
376403d
add output_path arg to batch_processor.py::BatchProcessor to optional…
tstescoTT Dec 31, 2024
daf0625
adding tests/test_vllm_seq_lens.py to test vllm sequence lengths and …
tstescoTT Dec 12, 2024
f3e34d1
fix TEST_PARAMS
tstescoTT Dec 13, 2024
4d360eb
adding fixed_batch_size to prompt_client_online_benchmark.py for bett…
tstescoTT Dec 17, 2024
41dcc22
use standard output values in ms
tstescoTT Dec 17, 2024
308eeaf
fix output filepath for prompt_client_online_benchmark.py, remove get…
tstescoTT Dec 17, 2024
e6fc8c4
add benchmark output file reader script
tstescoTT Dec 17, 2024
6295693
ruff formatting, rename benchmarking/benchmark_output_processor.py ->…
tstescoTT Dec 17, 2024
8963a12
add percentile-metrics to add e2els stats
tstescoTT Dec 18, 2024
fc8eb06
add latency to benchmarking/prompt_client_online_benchmark.py and sum…
tstescoTT Dec 18, 2024
6c4d092
support latency measurement with mean_e2el_ms
tstescoTT Dec 18, 2024
d8ec682
update benchmark sweeps
tstescoTT Dec 18, 2024
ffaabd6
update sweeps context lengths
tstescoTT Dec 18, 2024
4602ff3
model id as header not in table
tstescoTT Dec 18, 2024
594b9a1
add better formatting in benchmark_summary.py, update iso/osl sweeps
tstescoTT Dec 18, 2024
2ce6fe7
add better markdown formatting, add saving display .csv
tstescoTT Dec 18, 2024
6be324f
update sweep isl/osl
tstescoTT Dec 18, 2024
f558876
update sweep isl/osl
tstescoTT Dec 18, 2024
b4260d3
add metadata to markdown summary
tstescoTT Dec 18, 2024
89958d9
add ignore_eos=True to locust requests to use min/max tokens, increas…
tstescoTT Dec 18, 2024
126c588
update for llama 3.1 70B v0 testing
tstescoTT Dec 19, 2024
aef6a94
adding evals changes from tstesco/llama-evals
tstescoTT Dec 19, 2024
5ab1816
adding TT_METAL_COMMIT_SHA_OR_TAG=v0.54.0-rc2
tstescoTT Dec 19, 2024
0e5b67a
update README commit tags
tstescoTT Dec 19, 2024
0c48a9f
adding vllm benchmarking patch to stop sending unsupported params bes…
tstescoTT Dec 20, 2024
471c90b
move vllm-tt-metal-llama3-70b/setup.sh -> setup.sh, add support for H…
tstescoTT Dec 18, 2024
ec43450
add llama 3.2 refs
tstescoTT Dec 18, 2024
d17e46e
WIP make setup.sh run from repo root, add fixed model impl dir, env f…
tstescoTT Dec 18, 2024
895ff8d
adding setup.sh support for multiple models, adding support for llama…
tstescoTT Dec 19, 2024
49ee14f
update .env file location in documentation
tstescoTT Dec 19, 2024
1675f54
remove MODEL_IMPL_ROOT_DIR and add note about MODEL_NAME
tstescoTT Dec 19, 2024
d1bffe0
move setup_tt_metal_cache into setup_weights to use load_env scope
tstescoTT Dec 20, 2024
97c90cf
better logging and handling of {PERSISTENT_VOLUME}/model_weights dir …
tstescoTT Dec 20, 2024
6df6c7c
adding error message when huggingface-cli download fails with common …
tstescoTT Dec 20, 2024
6beaa66
update README for llama 3.1 70B v0 drop commits
tstescoTT Dec 21, 2024
Files changed:
2 changes: 1 addition & 1 deletion .gitignore
@@ -12,7 +12,7 @@ __pycache__
 env
 .testvenv
 python_env
-.venv
+.venv*
 
 # persistent storage volume
 persistent_volume
75 changes: 75 additions & 0 deletions benchmarking/README.md
@@ -36,3 +36,78 @@ python examples/offline_inference_tt.py --measure_perf --max_seqs_in_batch 32 --
- `--max_seqs_in_batch` (default: `32`):
- **Maximum batch size** for inference, determining the number of prompts processed in parallel.

### Online Benchmarking

#### single user

```bash
python utils/prompt_client_cli.py \
--num_prompts 32 \
--batch_size 1 \
--tokenizer_model meta-llama/Llama-3.1-70B-Instruct \
--max_prompt_length 128 \
--input_seq_len 128 \
--output_seq_len 128 \
--template chat_template \
--dataset random
```
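
For reference, the server being exercised here is the vLLM OpenAI-compatible API server. A minimal sketch of a direct completions request against it (the host, port, and served model name below are assumptions; adjust them to your deployment):

```python
import requests

# Assumed server address; adjust host/port to match your vLLM deployment.
API_URL = "http://localhost:8000/v1/completions"

payload = {
    "model": "meta-llama/Llama-3.1-70B-Instruct",  # assumed served model name
    "prompt": "What is the capital of France?",
    "temperature": 0.0,
    "max_tokens": 128,
    "stream": False,
    "ignore_eos": False,  # vLLM extension; the benchmark scripts set this to force a fixed output length
}

response = requests.post(API_URL, json=payload, timeout=600)
response.raise_for_status()
print(response.json()["choices"][0]["text"])
```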

#### using vllm/benchmarking/benchmark_serving.py
Within the Docker container, use the `benchmark_serving.patch` file (applied below). First, start the vLLM API server:
```
cd ~/app/src
python run_vllm_api_server.py
```
The patch simply stops the benchmarking script from sending the `best_of` arg, which is not supported and causes issues.

To run the benchmarks, open another shell into the Docker container:
```
cd ~/vllm
git apply ~/app/benchmarking/benchmark_serving.patch
cd ~/app
export PYTHONPATH=$PYTHONPATH:$PWD
python benchmarking/vllm_online_benchmark.py
```

A timestamped output file is written for each input/output sequence length defined.

Results are also printed to stdout, for example with mock data:
```
==================================================
Benchmark Result
==================================================
Successful requests: 32
Benchmark duration (s): 0.39
Total input tokens: 4096
Total generated tokens: 64
Request throughput (req/s): 83.04
Output token throughput (tok/s): 166.07
Total Token throughput (tok/s): 10794.77
--------------------------------------------------
Time to First Token
--------------------------------------------------
Mean TTFT (ms): 358.26
Median TTFT (ms): 358.45
P99 TTFT (ms): 361.67
--------------------------------------------------
Time per Output Token (excl. 1st token)
--------------------------------------------------
Mean TPOT (ms): 14.03
Median TPOT (ms): 14.13
P99 TPOT (ms): 14.30
--------------------------------------------------
Inter-token Latency
--------------------------------------------------
Mean ITL (ms): 7.86
Median ITL (ms): 7.83
P99 ITL (ms): 8.05
==================================================
```
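
As a rough sanity check on these headline numbers (assuming the throughputs are simply counts divided by the benchmark duration), they can be reproduced from the reported counts; small differences come from the duration being rounded in the printout:

```python
# Values taken from the mock-data report above.
successful_requests = 32
duration_s = 0.39
total_input_tokens = 4096
total_generated_tokens = 64

# Each throughput is a count divided by the wall-clock benchmark duration.
request_throughput = successful_requests / duration_s                                # ~82 req/s
output_token_throughput = total_generated_tokens / duration_s                        # ~164 tok/s
total_token_throughput = (total_input_tokens + total_generated_tokens) / duration_s  # ~10,667 tok/s

print(f"{request_throughput:.2f} req/s, "
      f"{output_token_throughput:.2f} output tok/s, "
      f"{total_token_throughput:.2f} total tok/s")
```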

#### using tt-inference-server/benchmarking/prompt_client_online_benchmark.py

```bash
export PYTHONPATH=$PYTHONPATH:$PWD
python benchmarking/prompt_client_online_benchmark.py
```

40 changes: 40 additions & 0 deletions benchmarking/benchmark_serving.patch
@@ -0,0 +1,40 @@
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 4813fde2..0cb3e72e 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -235,9 +235,7 @@ async def async_request_openai_completions(
"model": request_func_input.model,
"prompt": request_func_input.prompt,
"temperature": 0.0,
- "best_of": request_func_input.best_of,
"max_tokens": request_func_input.output_len,
- "logprobs": request_func_input.logprobs,
"stream": True,
"ignore_eos": request_func_input.ignore_eos,
}
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index c1a396c8..74f75a15 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -22,6 +22,12 @@ On the client side, run:
--endpoint /generate_stream
to the end of the command above.
"""
+import sys
+from unittest.mock import MagicMock
+# mock out ttnn fully so we can import ttnn without using it
+sys.modules["ttnn"] = MagicMock()
+sys.modules["ttnn.device"] = MagicMock()
+
import argparse
import asyncio
import base64
@@ -417,7 +423,7 @@ async def benchmark(
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
- best_of=best_of,
+ best_of=None,
multi_modal_content=test_mm_content,
ignore_eos=ignore_eos,
)
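
The `sys.modules` lines in this patch pre-register stubs so that any `import ttnn` inside the benchmarking code resolves to a mock instead of the real hardware library. A minimal standalone sketch of the same technique (the attribute accessed at the end is purely illustrative, not a real `ttnn` API):

```python
import sys
from unittest.mock import MagicMock

# Register stand-ins BEFORE anything imports the real package.
sys.modules["ttnn"] = MagicMock()
sys.modules["ttnn.device"] = MagicMock()

import ttnn  # resolves to the MagicMock registered above, not the real library

# Attribute access and calls now return further MagicMocks instead of touching
# hardware (illustrative attribute only).
result = ttnn.some_function(123)
print(type(result))  # <class 'unittest.mock.MagicMock'>
```

This matches the comment in the patch: `ttnn` can be imported by the benchmarking script without actually being used.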